In [116]:
#关闭警告信息
import warnings
warnings.filterwarnings('ignore')

读取数据并对数据进行必要的检查,包括缺失值、数据类型。

In [2]:
#读取数据
import pandas as pd
Data=pd.read_pickle('DNA methylation data/RA DNA methylation/RA_Methylation_Data.pkl')
Data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 6425 entries, 7462 to 7002
Columns: 100008 entries, sample_id to DiseaseEncoder
dtypes: float64(100002), int64(2), object(4)
memory usage: 4.8+ GB
In [3]:
#检查数据空缺值
print(Data.isnull().sum().sum())
136
In [5]:
#处理空缺数据
Data['gender'].fillna('M',inplace=True)
Data['GenderEncoder'].fillna(0,inplace=True)
In [6]:
#检查空缺数据处理结果
print(Data.isnull().sum().sum())
0
In [7]:
#查看数据结构
Data.head(10)
Out[7]:
sample_id cg00050873 cg00212031 cg00213748 cg00214611 cg00455876 cg01707559 cg02004872 cg02011394 cg02050847 ... cg12794168 cg12799119 cg12848808 age gender sample_type disease GenderEncoder sample_type_encoder DiseaseEncoder
7462 train17463 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 1.482415 -3.938986 -1.687774 39.0 F disease tissue rheumatoid arthritis 1.0 1 1
7463 train17464 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 0.890330 -3.619579 -1.672671 28.0 F disease tissue rheumatoid arthritis 1.0 1 1
7464 train17465 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 2.049755 -3.886935 -2.100192 68.0 F disease tissue rheumatoid arthritis 1.0 1 1
7465 train17466 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 0.832866 -4.112908 -2.324893 30.0 F disease tissue rheumatoid arthritis 1.0 1 1
7466 train17467 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 1.564056 -3.701353 -1.814692 69.0 F disease tissue rheumatoid arthritis 1.0 1 1
7467 train17468 1.687774 -4.247583 0.0 -3.744756 0.0 -2.185284 -3.314031 2.570129 3.744756 ... 1.482415 -3.580953 -1.982282 40.0 M disease tissue rheumatoid arthritis 0.0 1 1
7468 train17469 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 1.247949 -3.886935 -2.069693 47.0 F disease tissue rheumatoid arthritis 1.0 1 1
7469 train17470 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 1.790011 -4.051632 -1.918093 53.0 F disease tissue rheumatoid arthritis 1.0 1 1
7470 train17471 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 1.348848 -4.051632 -1.936120 62.0 F disease tissue rheumatoid arthritis 1.0 1 1
7471 train17472 0.000000 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 ... 1.300641 -4.322159 -1.848299 49.0 F disease tissue rheumatoid arthritis 1.0 1 1

10 rows × 100008 columns

使用PCA算法计算保留95%和99%信息的维度,并将数据降维至3维可视化数据分布。

In [9]:
#提取甲基化数据并进行PCA处理
Methylation=Data.iloc[:,1:-7]
print(Methylation.shape)
(6425, 100000)
In [8]:
#执行PCA算法-计算保留99%信息时的维度
from sklearn.decomposition import PCA
Methylaion_PCA=PCA(n_components=0.99)
PCA_Methylation_99=Methylaion_PCA.fit_transform(Methylation)    #数据降维
print(PCA_Methylation_99.shape)
(6425, 3981)
In [8]:
#执行PCA算法-计算保留99%信息时的维度
from sklearn.decomposition import PCA
Methylaion_PCA=PCA(n_components=0.95)
PCA_Methylation_95=Methylaion_PCA.fit_transform(Methylation)    #数据降维
print(PCA_Methylation_95.shape)
(6425, 2514)
In [9]:
#计算可解释方差比
import numpy as np
import matplotlib.pyplot as plt
Explain_variance=np.cumsum(Methylaion_PCA.explained_variance_ratio_)
#绘图
Explain_variance_plt=plt.figure(dpi=300)
Epl_variance=Explain_variance_plt.add_subplot(111)
Epl_variance.set_title('Explainable variance ratio curve')
Epl_variance.grid(color='black',linestyle='-.',alpha=0.2)
Epl_variance.plot(np.arange(1,len(Explain_variance)+1,1,dtype=int),Explain_variance,color='orange',linestyle='-')
#添加95%可解释方差曲线
Epl_variance.scatter(x=2514,y=0.95,color='green',marker='o')
Epl_variance.axhline(y=0.95, color='green', linestyle='-.')
Epl_variance.axvline(x=2514, color='green', linestyle='-.', label='Retain 95 per cent of explainable variance')
#添加99%可解释方差曲线
Epl_variance.scatter(3981,0.99,color='red',marker='o')
Epl_variance.axhline(y=0.99, color='red', linestyle='-.')
Epl_variance.axvline(x=3981, color='red', linestyle='-.', label='Retain 99 per cent of explainable variance')
Epl_variance.set_xlabel('n_components')
Epl_variance.set_ylabel('explained_variance_ratio')
plt.legend()
plt.show()
No description has been provided for this image

特征选择:执行低方差过滤、高相关过滤和Lossa回归进行特征选择。

In [10]:
#对数据采取低方差过滤算法
from pandas import DataFrame
from sklearn.feature_selection import VarianceThreshold
Variance_Selector=VarianceThreshold(threshold=0.5)
VarSele_Data=Variance_Selector.fit_transform(Methylation)    #执行低方差过滤
VarSele_Data=DataFrame(VarSele_Data)
VarSele_Data.columns=Variance_Selector.get_feature_names_out()
VarSele_Data.shape
Out[10]:
(6425, 63403)
In [11]:
#构建特征方差数据
VarianseData=DataFrame()
VarianseData['MetaBolite']=Variance_Selector.feature_names_in_
VarianseData['Variances']=Variance_Selector.variances_
VarianseData.shape
Out[11]:
(100000, 2)
In [12]:
#输出为Eecel文件
VarianseData.to_excel('/mnt/workspace/Analysis Data/VarianseData.xlsx','UTF-8')
In [13]:
#执行基于F分布的方差分析进行高相关过滤
from sklearn.feature_selection import SelectKBest, f_classif
Selectk=SelectKBest(score_func=f_classif,k=3981)    #保留99%可解释信息
FselectorData=Selectk.fit_transform(VarSele_Data,Data.loc[:,'DiseaseEncoder'])
FselectorData=pd.DataFrame(FselectorData)
feature_nameindex=Selectk.get_support(indices=True)    #获取特征索引
feature_names=VarSele_Data.columns    #获取特征名
Kfeature_names=[feature_names[i] for i in feature_nameindex]
FselectorData.columns=Kfeature_names
FselectorData.shape
Out[13]:
(6425, 3981)
In [14]:
#获取相关系数信息
KBestF_info=DataFrame()
KBestF_info['Feature']=Kfeature_names    #载入特征
KBestF_info['scores']=Selectk.scores_[:3981]
KBestF_info['P value']=Selectk.pvalues_[:3981]
KBestF_info.shape
Out[14]:
(3981, 3)
In [15]:
#输出相关系数数据
KBestF_info.to_excel('/mnt/workspace/Analysis Data/CorrData.xlsx','UTF-8')
In [16]:
#采用Lasso算法进行特征选择
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
Target=Data.loc[:,'DiseaseEncoder']
# 数据标准化  
scaler=StandardScaler()  
X_scaled=scaler.fit_transform(FselectorData)
# 拆分数据集  
X_train,X_test,y_train,y_test=train_test_split(X_scaled,Target, test_size=0.3, random_state=2024)
lasso_model = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=2024,max_iter=10000)    # 设置Lasso模型 
lasso_model.fit(X_train,y_train)    # 训练模型  
Out[16]:
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
                   solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
                   solver='liblinear')
In [17]:
#评估Lasso模型性能
print('Lasso模型训练集Accuracy为:',accuracy_score(y_train,lasso_model.predict(X_train)))
print('Lasso模型测试集Accuracy为:',accuracy_score(y_test,lasso_model.predict(X_test)))
Lasso模型训练集Accuracy为: 0.9968868134311764
Lasso模型测试集Accuracy为: 0.9942946058091287
In [18]:
#查看模型混淆矩阵
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def Display_ConfusionMatrix(model,data,target):
    ConfusionMatrix_result=confusion_matrix(target,model.predict(data),labels=[0,1])    #计算混淆举证
    Display_ConfusionMatrix=ConfusionMatrixDisplay(ConfusionMatrix_result,display_labels=['control','rheumatoid arthritis'])
    Display_ConfusionMatrix.plot(include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format='d', ax=None)
    plt.title('Confusion Matrix')
    plt.show()
In [19]:
Display_ConfusionMatrix(lasso_model,X_train,y_train)    #训练集
No description has been provided for this image
In [20]:
Display_ConfusionMatrix(lasso_model,X_test,y_test)   #测试集
No description has been provided for this image
In [21]:
#提取特征并进行特征选择
import numpy as np
LassoFeature_Index=np.where(lasso_model.coef_!=0)[1]
LassoFeature=FselectorData.iloc[:,LassoFeature_Index]
LassoFeature.shape
Out[21]:
(6425, 59)
In [22]:
#输出lossa模型
import joblib
joblib.dump(lasso_model,'/mnt/workspace/Analysis Model/LossaModel.pkl')
Out[22]:
['/mnt/workspace/Analysis Model/LossaModel.pkl']

使用Lasso验证DNA甲基化位点特征选择后的结果.

In [23]:
#使用特征选择后的模型验证模型精度-输入[batchs,70],输出[batch,1]
X_TrainEv,X_testEv,y_trainEv,y_testEv=train_test_split(LassoFeature,Target, test_size=0.3, random_state=2024)
lasso_modelEvaluateD=LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=2024,max_iter=10000)
lasso_modelEvaluateD.fit(X_TrainEv,y_trainEv)
Out[23]:
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
                   solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
                   solver='liblinear')
In [24]:
#使用特征选择后的数据集再次训练模型评估特征选择效果-10折交叉验证
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import cross_validate
def Model_Evelate_CV(Model,Data,Target):
    cv=KFold(n_splits=10, shuffle=True, random_state=2025)
    kv_scores =cross_validate(Model,Data,Target,cv=cv,scoring='accuracy',
                          return_train_score=True)
    print('Fit_time:',kv_scores['fit_time'])
    print('Mean Fit_time:',kv_scores['fit_time'].mean())
    print('score_time:',kv_scores['score_time'])
    print('Mean score_time:',kv_scores['score_time'].mean())
    print('train_score:',kv_scores['train_score'])
    print('Mean train_score:',kv_scores['train_score'].mean())
    print('test_score:',kv_scores['test_score'])
    print('Mean test_score:',kv_scores['test_score'].mean())
#用特征选择后的数据集再次训练模型评估特征选择效果-10折交叉验证
lasso_modelEvaluate=LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=2024,max_iter=10000)    # 设置Lasso模型
Model_Evelate_CV(Model=lasso_modelEvaluate,Data=LassoFeature,Target=Target)
Fit_time: [0.13763237 0.15320015 0.17018366 0.14581394 0.12758136 0.13371754
 0.1441164  0.13194776 0.13309407 0.13670778]
Mean Fit_time: 0.14139950275421143
score_time: [0.00301838 0.00292587 0.00316906 0.00275707 0.003124   0.00312471
 0.00284743 0.00251055 0.0025866  0.00309825]
Mean score_time: 0.0029161930084228515
train_score: [0.99688689 0.99740574 0.99723279 0.99723279 0.99723279 0.99740619
 0.99740619 0.99706035 0.99671451 0.99757911]
Mean train_score: 0.9972157356217635
test_score: [0.99377916 0.99377916 0.99533437 0.99688958 0.99688958 0.99221184
 0.99376947 0.99688474 0.99844237 0.9953271 ]
Mean test_score: 0.9953307364718537
In [25]:
#训练集混淆举证
Display_ConfusionMatrix(lasso_modelEvaluateD,X_TrainEv,y_trainEv)
No description has been provided for this image
In [26]:
#测试集混淆举证
Display_ConfusionMatrix(lasso_modelEvaluateD,X_testEv,y_testEv)
No description has been provided for this image
In [27]:
#整合并输出特征选择结果
CG_Feature=Data.iloc[:,LassoFeature_Index]
DNA_Methylstion_Feature=pd.concat([CG_Feature,Data.iloc[:,-7:]],axis=1)
DNA_Methylstion_Feature.shape
Out[27]:
(6425, 66)
In [28]:
#输出特征选择甲基化数据
DNA_Methylstion_Feature.to_pickle('/mnt/workspace/DNA methylation data/RA DNA methylation/RA_Methylation_Feature.pkl')
DNA_Methylstion_Feature.to_excel('/mnt/workspace/DNA methylation data/RA DNA methylation/RA_Methylation_Feature.xlsx','UTF-8')
DNA_Methylstion_Feature.to_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/RA_Methylation_Feature.csv')

使用机器学习提取关键甲基化位点

In [1]:
#读取特征选择后的甲基化数据
import pandas as pd
MethylationFeature=pd.read_csv('/mnt/workspace/RA_Methylation_Feature.csv')
MethylationFeature.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6425 entries, 0 to 6424
Data columns (total 67 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           6425 non-null   int64  
 1   cg00455876           6425 non-null   float64
 2   cg05544622           6425 non-null   float64
 3   cg00423014           6425 non-null   float64
 4   cg00478198           6425 non-null   float64
 5   cg00776430           6425 non-null   float64
 6   cg01938887           6425 non-null   float64
 7   cg02714462           6425 non-null   float64
 8   cg02896361           6425 non-null   float64
 9   cg02971902           6425 non-null   float64
 10  cg03601619           6425 non-null   float64
 11  cg04029664           6425 non-null   float64
 12  cg04302300           6425 non-null   float64
 13  cg04699313           6425 non-null   float64
 14  cg05257372           6425 non-null   float64
 15  cg05443523           6425 non-null   float64
 16  cg06411441           6425 non-null   float64
 17  cg07066594           6425 non-null   float64
 18  cg08141049           6425 non-null   float64
 19  cg08311689           6425 non-null   float64
 20  cg09357232           6425 non-null   float64
 21  cg10315562           6425 non-null   float64
 22  cg11704979           6425 non-null   float64
 23  cg11923788           6425 non-null   float64
 24  cg12944030           6425 non-null   float64
 25  cg14061423           6425 non-null   float64
 26  cg14745925           6425 non-null   float64
 27  cg15388264           6425 non-null   float64
 28  cg16078210           6425 non-null   float64
 29  cg17482649           6425 non-null   float64
 30  cg17488844           6425 non-null   float64
 31  cg17794813           6425 non-null   float64
 32  cg19248041           6425 non-null   float64
 33  cg19532714           6425 non-null   float64
 34  cg19603571           6425 non-null   float64
 35  cg19784262           6425 non-null   float64
 36  cg22055524           6425 non-null   float64
 37  cg22219869           6425 non-null   float64
 38  cg22221554           6425 non-null   float64
 39  cg22561883           6425 non-null   float64
 40  cg23299576           6425 non-null   float64
 41  cg23925558           6425 non-null   float64
 42  cg23993836           6425 non-null   float64
 43  cg24627619           6425 non-null   float64
 44  cg26012731           6425 non-null   float64
 45  cg26039926           6425 non-null   float64
 46  cg26936230           6425 non-null   float64
 47  cg00097228           6425 non-null   float64
 48  cg00152515           6425 non-null   float64
 49  cg00306390           6425 non-null   float64
 50  cg00342358           6425 non-null   float64
 51  cg00397635           6425 non-null   float64
 52  cg00455424           6425 non-null   float64
 53  cg00534295           6425 non-null   float64
 54  cg00543485           6425 non-null   float64
 55  cg00581848           6425 non-null   float64
 56  cg01393939           6425 non-null   float64
 57  cg01439876           6425 non-null   float64
 58  cg01446477           6425 non-null   float64
 59  cg01515508           6425 non-null   float64
 60  age                  6425 non-null   float64
 61  gender               6425 non-null   object 
 62  sample_type          6425 non-null   object 
 63  disease              6425 non-null   object 
 64  GenderEncoder        6425 non-null   float64
 65  sample_type_encoder  6425 non-null   int64  
 66  DiseaseEncoder       6425 non-null   int64  
dtypes: float64(61), int64(3), object(3)
memory usage: 3.3+ MB
In [2]:
MethylationFeature.isnull().sum().sum()    #检测空缺数据
Out[2]:
0
In [3]:
#统计性别数据
from collections import Counter
print(Counter(MethylationFeature['gender']))
MethylationFeature['gender'].hist()
Counter({'F': 3430, 'M': 2995})
Out[3]:
<Axes: >
No description has been provided for this image
In [4]:
#统计疾病数据
print(Counter(MethylationFeature['disease']))
MethylationFeature['disease'].hist()
Counter({'control': 6266, 'rheumatoid arthritis': 159})
Out[4]:
<Axes: >
No description has been provided for this image

进行机器学习分析,筛选类风湿性关节炎特征甲基化位点。

In [5]:
Methylation=MethylationFeature.iloc[:,1:-7]    #甲基化数据
MapData=MethylationFeature.loc[:,['age','GenderEncoder','DiseaseEncoder']]    #其他附加数据
MLData=pd.concat([Methylation,MapData],axis=1)    #数据合并
MLData.shape
Out[5]:
(6425, 62)

正常样本与疾病样本总量差异较大,使用数据重采样算法解决数据类别分布不均衡问题。

In [6]:
#查看模型混淆矩阵
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def Display_ConfusionMatrix(model,data,target):
    ConfusionMatrix_result=confusion_matrix(target,model.predict(data),labels=[0,1])    #计算混淆举证
    Display_ConfusionMatrix=ConfusionMatrixDisplay(ConfusionMatrix_result,display_labels=['control','rheumatoid arthritis'])
    Display_ConfusionMatrix.plot(include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format='d', ax=None)
    plt.title('Confusion Matrix')
    plt.show()
In [7]:
#计算测试集PR曲线
from pandas import DataFrame
from sklearn.metrics import precision_recall_curve,accuracy_score
def PR_Curve(Model,Data,Label):
    X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2024)    #划分数据集
    predict_score=Model.predict_proba(X_test)[:, 1]    #获取概率值
    predict=Model.predict(X_test)     #获取预测标签
    accuracy=accuracy_score(y_test,predict)
    precision, recall, thresholds = precision_recall_curve(y_test, predict_score)    #计算PR曲线
    PR=DataFrame()    #将PR曲线数据合并到DataFrame
    #PR['thresholds']=thresholds
    PR['recall']=recall
    PR['precision']=precision
    return PR,accuracy
In [8]:
#计算测试集ROC曲线
from sklearn.metrics import roc_curve, auc
def ROC_Curve(Model,Data,Label):
    X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2024)    #划分数据集
    predict_score=Model.predict_proba(X_test)[:, 1]    #获取概率值
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    roc_auc = auc(fpr, tpr)    #计算AUC
    ROC=DataFrame()    #将PR曲线数据合并到DataFrame
    #ROC['thresholds']=thresholds
    ROC['tpr']=tpr
    ROC['fpr']=fpr
    return ROC,roc_auc
In [9]:
#编写模型训练评估函数
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
def RunTestModel(Model,Data,Label):
    X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2024)    #划分数据集
    Model.fit(X_train,y_train)    #训练模型
    TrainPredict=Model.predict(X_train)    #计算训练集指标
    TrainPrecision=precision_score(y_train,TrainPredict)
    TrainRecall=recall_score(y_train,TrainPredict)
    TrainF1=f1_score(y_train,TrainPredict)
    TrainAcuracy=accuracy_score(y_train,TrainPredict)
    print('模型训练集Precision:{0},Recall:{1},F1_Score:{2},Accuracy:{3}'.format(TrainPrecision,TrainRecall,TrainF1,TrainAcuracy))
    TestPredict=Model.predict(X_test)    #测试集预测结果
    TestPrecision=precision_score(y_test,TestPredict)
    TestRecall=recall_score(y_test,TestPredict)
    TestF1=f1_score(y_test,TestPredict)
    TestAccuracy=accuracy_score(y_test,TestPredict)
    print('模型测试集Precision:{0},Recall:{1},F1_score:{2},Accuracy:{3}'.format(TestPrecision,TestRecall,TestF1,TestAccuracy))
    print('-------------------测试集混淆举证-------------------')
    Display_ConfusionMatrix(model=Model,data=X_test,target=y_test)
In [10]:
#切割数据
MLTestData=MLData.iloc[:,:-1]
MLTestLabel=MLData.iloc[:,-1]
In [11]:
#数据归一化
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler,MinMaxScaler
ColumnsNames=MLTestData.columns
Stand=StandardScaler()
StandData=Stand.fit_transform(MLTestData)
MinMax=MinMaxScaler(feature_range=(0,1))
MinMaxData=MinMax.fit_transform(StandData)
MLTestData=DataFrame(MinMaxData)
MLTestData.columns=ColumnsNames
In [12]:
#使用随机森林算法作为衡量基准测试数据重采样算法性能
from sklearn.ensemble import RandomForestClassifier
Forest=RandomForestClassifier(random_state=2025)
RunTestModel(Model=Forest,Data=MLTestData,Label=MLTestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:0.75,Recall:0.07142857142857142,F1_score:0.13043478260869565,Accuracy:0.979253112033195
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [14]:
RF_PR,RF_Accuracy=PR_Curve(Model=Forest,Data=MLTestData,Label=MLTestLabel)
In [15]:
RF_ROC,RF_AUC=ROC_Curve(Model=Forest,Data=MLTestData,Label=MLTestLabel)
In [19]:
#输出相关数据
RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/RF_PR.xlsx')
RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/RF_ROC.xlsx')

测试欠采样算法。

In [22]:
#使用CNN算法进行欠采样
from imblearn.under_sampling import CondensedNearestNeighbour
CNN=CondensedNearestNeighbour(sampling_strategy='not minority',random_state=2024,n_jobs=-1)
CNN_TestData,CNN_TestLabel=CNN.fit_resample(MLTestData,MLTestLabel)
Counter(CNN_TestLabel)
Out[22]:
Counter({0: 338, 1: 159})
In [23]:
ForestCNN=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestCNN,Data=CNN_TestData,Label=CNN_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:0.675,Recall:0.6136363636363636,F1_score:0.6428571428571429,Accuracy:0.8
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [24]:
CNN_RF_PR,CNN_RF_Accuracy=PR_Curve(Model=ForestCNN,Data=CNN_TestData,Label=CNN_TestLabel)
CNN_RF_ROC,CNN_RF_AUC=ROC_Curve(Model=ForestCNN,Data=CNN_TestData,Label=CNN_TestLabel)
In [25]:
#输出相关数据
CNN_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/CNN_RF_PR.xlsx')
CNN_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/CNN_RF_ROC.xlsx')
In [26]:
#使用IHT算法进行欠采样
from imblearn.under_sampling import InstanceHardnessThreshold
IHT=InstanceHardnessThreshold(random_state=2024,cv=5,n_jobs=-1)
IHT_TestData,IHT_TestLabel=IHT.fit_resample(MLTestData,MLTestLabel)
Counter(IHT_TestLabel)
Out[26]:
Counter({0: 3767, 1: 159})
In [27]:
ForestIHT=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestIHT,Data=IHT_TestData,Label=IHT_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:1.0,Recall:0.7719298245614035,F1_score:0.8712871287128713,Accuracy:0.9889643463497453
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [28]:
IHT_RF_PR,IHT_RF_Accuracy=PR_Curve(Model=ForestIHT,Data=IHT_TestData,Label=IHT_TestLabel)
IHT_RF_ROC,IHT_RF_AUC=ROC_Curve(Model=ForestIHT,Data=IHT_TestData,Label=IHT_TestLabel)
#输出相关数据
IHT_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/IHT_RF_PR.xlsx')
IHT_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/IHT_RF_ROC.xlsx')
In [29]:
#使用NearMiss进行欠采样
from imblearn.under_sampling import NearMiss
NM=NearMiss(sampling_strategy='not minority',n_jobs=-1)
NM_TestData,NM_TestLabel=NM.fit_resample(MLTestData,MLTestLabel)
Counter(NM_TestLabel)
Out[29]:
Counter({0: 159, 1: 159})
In [30]:
ForestNM=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestNM,Data=NM_TestData,Label=NM_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:0.8653846153846154,Recall:0.8490566037735849,F1_score:0.8571428571428571,Accuracy:0.84375
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [31]:
NM_RF_PR,NM_RF_Accuracy=PR_Curve(Model=ForestNM,Data=NM_TestData,Label=NM_TestLabel)
NM_RF_ROC,NM_RF_AUC=ROC_Curve(Model=ForestNM,Data=NM_TestData,Label=NM_TestLabel)
#输出相关数据
NM_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/NM_RF_PR.xlsx')
NM_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/NM_RF_ROC.xlsx')
In [32]:
#使用NeighbourhoodCleaningRule进行欠采样
from imblearn.under_sampling import NeighbourhoodCleaningRule
NBC=NeighbourhoodCleaningRule(sampling_strategy='not minority',n_jobs=-1)
NBC_TestData,NBC_TestLabel=NBC.fit_resample(MLTestData,MLTestLabel)
Counter(NBC_TestLabel)
Out[32]:
Counter({0: 6109, 1: 159})
In [33]:
ForestNBC=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestNBC,Data=NBC_TestData,Label=NBC_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:1.0,Recall:0.24390243902439024,F1_score:0.39215686274509803,Accuracy:0.9835194045720361
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [34]:
NBC_RF_PR,NBC_RF_Accuracy=PR_Curve(Model=ForestNBC,Data=NBC_TestData,Label=NBC_TestLabel)
NBC_RF_ROC,NBC_RF_AUC=ROC_Curve(Model=ForestNBC,Data=NBC_TestData,Label=NBC_TestLabel)
#输出相关数据
NBC_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/NBC_RF_PR.xlsx')
NBC_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/NBC_RF_ROC.xlsx')
In [35]:
#使用OneSidedSelection进行欠采样
from imblearn.under_sampling import OneSidedSelection
OSS=OneSidedSelection(sampling_strategy='not minority',random_state=2024,n_jobs=-1)
OSS_TestData,OSS_TestLabel=OSS.fit_resample(MLTestData,MLTestLabel)
Counter(OSS_TestLabel)
Out[35]:
Counter({0: 6163, 1: 159})
In [36]:
ForestOSS=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestOSS,Data=OSS_TestData,Label=OSS_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:1.0,Recall:0.10869565217391304,F1_score:0.19607843137254902,Accuracy:0.9783869267264101
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [37]:
OSS_RF_PR,OSS_RF_Accuracy=PR_Curve(Model=ForestOSS,Data=OSS_TestData,Label=OSS_TestLabel)
OSS_RF_ROC,OSS_RF_AUC=ROC_Curve(Model=ForestOSS,Data=OSS_TestData,Label=OSS_TestLabel)
#输出相关数据
OSS_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/OSS_RF_PR.xlsx')
OSS_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/OSS_RF_ROC.xlsx')
In [38]:
#使用随机欠采样进行数据欠采样
from imblearn.under_sampling import RandomUnderSampler
RUS=RandomUnderSampler(sampling_strategy='not minority',random_state=2024)
RUS_TestData,RUS_TestLabel=RUS.fit_resample(MLTestData,MLTestLabel)
Counter(RUS_TestLabel)
Out[38]:
Counter({0: 159, 1: 159})
In [39]:
ForestRUS=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestRUS,Data=RUS_TestData,Label=RUS_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:0.9444444444444444,Recall:0.9622641509433962,F1_score:0.9532710280373832,Accuracy:0.9479166666666666
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [40]:
RUS_RF_PR,RUS_RF_Accuracy=PR_Curve(Model=ForestRUS,Data=RUS_TestData,Label=RUS_TestLabel)
RUS_RF_ROC,RUS_RF_AUC=ROC_Curve(Model=ForestRUS,Data=RUS_TestData,Label=RUS_TestLabel)
#输出相关数据
RUS_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/RUS_RF_PR.xlsx')
RUS_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/RUS_RF_ROC.xlsx')
In [41]:
#绘制PR曲线
import matplotlib.pyplot as plt
PR_curve=plt.figure(dpi=300)
PR_ax=PR_curve.add_subplot(111)
PR_ax.set_title('Precision-Recall curve')
PR_ax.plot(RF_PR['recall'], RF_PR['precision'],color='red',label='RandomForest=%f'%RF_Accuracy)
PR_ax.plot(CNN_RF_PR['recall'], CNN_RF_PR['precision'],color='green',label='CNN=%f'%CNN_RF_Accuracy)
PR_ax.plot(IHT_RF_PR['recall'], IHT_RF_PR['precision'],color='blue',label='IHT=%f'%IHT_RF_Accuracy)
PR_ax.plot(NBC_RF_PR['recall'], NBC_RF_PR['precision'],color='yellow',label='NBC=%f'%NBC_RF_Accuracy)
PR_ax.plot(NM_RF_PR['recall'], NM_RF_PR['precision'],color='purple',label='NM=%f'%NM_RF_Accuracy)
PR_ax.plot(OSS_RF_PR['recall'], OSS_RF_PR['precision'],color='cyan',label='OSS=%f'%OSS_RF_Accuracy)
PR_ax.plot(RUS_RF_PR['recall'], RUS_RF_PR['precision'],color='pink',label='RUS=%f'%RUS_RF_Accuracy)
PR_ax.plot([0,1],[1,0],linestyle='-.',color='black')
PR_ax.set_xlabel('Recall')  
PR_ax.set_ylabel('Precision')    
plt.legend(loc="best")  
plt.show()
No description has been provided for this image
In [42]:
#绘制ROC曲线
import matplotlib.pyplot as plt
ROC_curve=plt.figure(dpi=300)
ROC_ax=ROC_curve.add_subplot(111)
ROC_ax.set_title('ROC Curve')
ROC_ax.plot(RF_ROC['fpr'], RF_ROC['tpr'],color='red',label='RandomForest=%f'%RF_AUC)
ROC_ax.plot(CNN_RF_ROC['fpr'], CNN_RF_ROC['tpr'],color='green',label='CNN=%f'%CNN_RF_AUC)
ROC_ax.plot(IHT_RF_ROC['fpr'], IHT_RF_ROC['tpr'],color='blue',label='IHT=%f'%IHT_RF_AUC)
ROC_ax.plot(NBC_RF_ROC['fpr'], NBC_RF_ROC['tpr'],color='yellow',label='NBC=%f'%NBC_RF_AUC)
ROC_ax.plot(NM_RF_ROC['fpr'], NM_RF_ROC['tpr'],color='purple',label='NM=%f'%NM_RF_AUC)
ROC_ax.plot(OSS_RF_ROC['fpr'], OSS_RF_ROC['tpr'],color='cyan',label='OSS=%f'%OSS_RF_AUC)
ROC_ax.plot(RUS_RF_ROC['fpr'], RUS_RF_ROC['tpr'],color='pink',label='RUS=%f'%RUS_RF_AUC)
ROC_ax.plot([0,1],[0,1],linestyle='-.',color='black')
ROC_ax.set_xlim([-0.05, 1.0])  
ROC_ax.set_ylim([0, 1.05])  
ROC_ax.set_xlabel('FPR')  
ROC_ax.set_ylabel('TPR')    
plt.legend(loc="best")  
plt.show()
No description has been provided for this image
In [122]:
import joblib
Model=[Forest,ForestCNN,ForestIHT,ForestNM,ForestNBC,ForestOSS,ForestRUS]
ModelStr=['Forest.pkl','ForestCNN.pkl','ForestIHT.pkl','ForestNM.pkl','ForestNBC.pkl','ForestOSS.pkl','ForestRUS.pkl']
try:
    for model,modelstr in zip(Model,ModelStr):
        joblib.dump(model,'/mnt/workspace/Analysis Model/Resample Models/'+modelstr)
    print('模型保存成功!')
except:
    print('模型保存异常!!!')
模型保存成功!

经过混淆矩阵、PR曲线、Roc曲线综合评估,选择随机过采样算法进行数据重采样。数据重采样后进行机器学习和深度神经网络训练并采用可解释性机器学习框架提取关键甲基化位点进行特征分析。

In [123]:
MethylationRUS=MethylationFeature.iloc[:,1:-1]
LabelRUS=MethylationFeature.iloc[:,-1]
In [124]:
#合并过采样结果
RUS=RandomUnderSampler(sampling_strategy='not minority',random_state=2024)
RUS_Methylation,RUS_Label=RUS.fit_resample(MethylationRUS,LabelRUS)
Counter(RUS_Label)
Out[124]:
Counter({0: 159, 1: 159})
In [125]:
RUSData=pd.concat([RUS_Methylation,RUS_Label],axis=1)
RUSData.shape
Out[125]:
(318, 66)
In [ ]:
#输出计算合并结果
RUSData.to_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/RUS DNA Methylation Data.csv')
RUSData.to_excel('/mnt/workspace/DNA methylation data/RA DNA methylation/RUS DNA Methylation Data.xlsx','UFT-8')

读取数据进行机器学习提取特征位点。

In [1]:
#读取数据
import pandas as pd
MethylationData=pd.read_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/RUS DNA Methylation Data.csv')
MethylationData.shape
Out[1]:
(318, 67)
In [2]:
#检查空缺数据
MethylationData.isnull().sum().sum()
Out[2]:
0
In [3]:
#数据选择和缩放数据
MLFeatureData=MethylationData.iloc[:,1:-6]
MLFeatureData=pd.concat([MLFeatureData,MethylationData.loc[:,'GenderEncoder']],axis=1)
MLFeatureData
Out[3]:
cg00455876 cg05544622 cg00423014 cg00478198 cg00776430 cg01938887 cg02714462 cg02896361 cg02971902 cg03601619 ... cg00455424 cg00534295 cg00543485 cg00581848 cg01393939 cg01439876 cg01446477 cg01515508 age GenderEncoder
0 0.000000 0.000000 1.202370 0.224898 -1.672671 -1.271175 3.201956 -0.480968 -0.739857 3.406479 ... -4.940737 -3.343970 -4.178048 -2.616463 3.149987 -2.207487 -3.100385 3.701353 19.0000 1.0
1 0.000000 0.000000 1.019899 0.409549 -1.578093 -1.130596 3.744756 -0.289957 -0.180452 2.555085 ... -3.790034 -6.163916 -6.811545 -2.468664 0.152263 -2.207487 -3.406479 3.543689 12.0000 1.0
2 0.000000 0.000000 0.241113 0.140201 -0.663152 -1.168907 3.507691 -0.076021 -0.570906 3.029956 ... -2.196336 -3.938986 -4.247583 0.000000 0.405382 -2.414368 -4.051632 1.557086 71.8192 1.0
3 0.510719 4.178048 1.839820 -1.848299 -2.698069 -2.921730 4.807960 -3.701353 -3.938986 2.401172 ... -3.790034 -4.489850 -6.163916 -2.732410 3.124904 -2.337277 -3.228904 4.178048 23.0000 0.0
4 0.000000 0.000000 0.837604 0.351505 -0.809306 -1.456115 4.402578 0.000000 -0.772045 2.681278 ... -3.580953 -3.256540 -5.773449 0.000000 2.241551 -2.414368 -3.659672 3.938986 84.0000 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
313 0.000000 0.000000 0.000000 0.072017 0.000000 -1.798190 3.374769 0.000000 -0.654254 2.300497 ... -3.744756 -4.585271 -3.886935 0.000000 2.441207 -2.496772 -4.051632 4.402578 34.0000 1.0
314 0.000000 3.659672 0.000000 -2.600812 -3.149987 -4.112908 4.585271 0.000000 -4.489850 3.201956 ... -3.790034 -4.178048 -5.273603 0.000000 2.881228 -2.253116 -3.284902 3.993781 51.0000 0.0
315 0.000000 0.000000 1.130596 0.039997 0.000000 -2.163468 3.007447 0.000000 -0.767424 2.020410 ... -4.112908 -4.051632 -6.163916 0.000000 0.000000 -2.570129 -3.701353 3.472874 45.0000 1.0
316 0.000000 0.000000 0.000000 0.140201 0.000000 -1.578093 3.149987 0.000000 -0.506456 2.555085 ... -3.343970 -3.790034 -4.112908 0.000000 2.785946 -2.616463 -3.543689 4.178048 31.0000 1.0
317 0.000000 0.000000 1.672671 -0.027996 -1.082414 -1.141464 2.648415 0.000000 -0.426255 2.632328 ... -3.659672 -4.807960 -5.273603 0.000000 2.349789 -2.349789 -3.993781 4.940737 58.0000 1.0

318 rows × 61 columns

In [4]:
#数据缩放和标准化
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler,MinMaxScaler
FeatureNames=MLFeatureData.columns    #保存变量名
Standard=StandardScaler()    #标准化数据
StandardData=Standard.fit_transform(MLFeatureData)
MinMax=MinMaxScaler(feature_range=(0,1))    #缩放数据
MinMaxData=MinMax.fit_transform(StandardData)
MLRunData=DataFrame(MinMaxData)
MLRunData.columns=FeatureNames
MLRunData.shape
Out[4]:
(318, 61)
In [5]:
#获取标签数据
ReMLLabel=MethylationData.iloc[:,-1]
ReMLLabel
Out[5]:
0      0
1      0
2      0
3      0
4      0
      ..
313    1
314    1
315    1
316    1
317    1
Name: DiseaseEncoder, Length: 318, dtype: int64
In [6]:
#查看模型混淆矩阵
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def Display_ConfusionMatrix(model,data,target):
    ConfusionMatrix_result=confusion_matrix(target,model.predict(data),labels=[0,1])    #计算混淆举证
    Display_ConfusionMatrix=ConfusionMatrixDisplay(ConfusionMatrix_result,display_labels=['control','rheumatoid arthritis'])
    Display_ConfusionMatrix.plot(include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format='d', ax=None)
    plt.title('Confusion Matrix')
    plt.show()
In [7]:
#计算PR曲线
from pandas import DataFrame
from sklearn.metrics import precision_recall_curve,accuracy_score
def PR_Curve(Model,Data,Label):
    X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2025)    #划分数据集
    predict_score=Model.predict_proba(X_test)[:, 1]    #获取概率值
    predict=Model.predict(X_test)     #获取预测标签
    accuracy=accuracy_score(y_test,predict)
    precision, recall, thresholds = precision_recall_curve(y_test, predict_score)    #计算PR曲线
    PR=DataFrame()    #将PR曲线数据合并到DataFrame
    #PR['thresholds']=thresholds
    PR['recall']=recall
    PR['precision']=precision
    return PR,accuracy
In [8]:
#计算ROC曲线
from sklearn.metrics import roc_curve, auc
def ROC_Curve(Model,Data,Label):
    X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2025)    #划分数据集
    predict_score=Model.predict_proba(X_test)[:, 1]    #获取概率值
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    roc_auc = auc(fpr, tpr)    #计算AUC
    ROC=DataFrame()    #将PR曲线数据合并到DataFrame
    #ROC['thresholds']=thresholds
    ROC['tpr']=tpr
    ROC['fpr']=fpr
    return ROC,roc_auc
In [9]:
#模型训练评估函数
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
def RunMLModel(Model,Data,Label):
    X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2025)    #划分数据集
    Model.fit(X_train,y_train)    #训练模型
    TrainPredict=Model.predict(X_train)    #计算训练集指标
    TrainPrecision=precision_score(y_train,TrainPredict)
    TrainRecall=recall_score(y_train,TrainPredict)
    TrainF1=f1_score(y_train,TrainPredict)
    TrainAcuracy=accuracy_score(y_train,TrainPredict)
    print('模型训练集Precision:{0},Recall:{1},F1_Score:{2},Accuracy:{3}'.format(TrainPrecision,TrainRecall,TrainF1,TrainAcuracy))
    TestPredict=Model.predict(X_test)    #测试集预测结果
    TestPrecision=precision_score(y_test,TestPredict)
    TestRecall=recall_score(y_test,TestPredict)
    TestF1=f1_score(y_test,TestPredict)
    TestAccuracy=accuracy_score(y_test,TestPredict)
    print('模型测试集Precision:{0},Recall:{1},F1_score:{2},Accuracy:{3}'.format(TestPrecision,TestRecall,TestF1,TestAccuracy))
    print('-------------------测试集混淆举证-------------------')
    Display_ConfusionMatrix(model=Model,data=X_test,target=y_test)
In [10]:
#测试Logistic回归
from sklearn.linear_model import LogisticRegression
Logistic=LogisticRegression(penalty="l2",dual=True,tol=1e-4,C=1.0,fit_intercept=True,random_state=2024,solver='liblinear',
                            max_iter=100,multi_class="auto",verbose=0, warm_start=False,n_jobs=-1,l1_ratio=None)
RunMLModel(Model=Logistic,Data=MLRunData,Label=ReMLLabel)
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 8.
  warnings.warn(
模型训练集Precision:0.8870967741935484,Recall:0.9649122807017544,F1_Score:0.9243697478991597,Accuracy:0.918918918918919
模型测试集Precision:0.88,Recall:0.9777777777777777,F1_score:0.9263157894736842,Accuracy:0.9270833333333334
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [11]:
#计算PR曲线和ROC曲线值
Logistic_PR,Logistic_Accuracy=PR_Curve(Model=Logistic,Data=MLRunData,Label=ReMLLabel)    
Logistic_ROC,Logistic_AUC=ROC_Curve(Model=Logistic,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
Logistic_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/Logistic_PR.xlsx','UTF-8')
Logistic_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/Logistic_ROC.xlsx','UTF-8')
/tmp/ipykernel_295/3354021341.py:5: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
  Logistic_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/Logistic_PR.xlsx','UTF-8')
/tmp/ipykernel_295/3354021341.py:6: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
  Logistic_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/Logistic_ROC.xlsx','UTF-8')
In [12]:
#测试SVM算法
from sklearn.svm import SVC
SVM_Classifier=SVC(C=1.0,kernel="rbf",degree=3,gamma="scale",coef0=0.0,shrinking=True,probability=True,tol=1e-3, 
                   cache_size=200,max_iter=-1, decision_function_shape="ovr",random_state=2024)
RunMLModel(Model=SVM_Classifier,Data=MLRunData,Label=ReMLLabel)
模型训练集Precision:0.8934426229508197,Recall:0.956140350877193,F1_Score:0.923728813559322,Accuracy:0.918918918918919
模型测试集Precision:0.9361702127659575,Recall:0.9777777777777777,F1_score:0.9565217391304348,Accuracy:0.9583333333333334
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [13]:
#计算PR曲线和ROC曲线值
SVM_PR,SVM_Accuracy=PR_Curve(Model=SVM_Classifier,Data=MLRunData,Label=ReMLLabel)    
SVM_ROC,SVM_AUC=ROC_Curve(Model=SVM_Classifier,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
SVM_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/SVM_PR.xlsx','UTF-8')
SVM_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/SVM_ROC.xlsx','UTF-8')
/tmp/ipykernel_295/4264335228.py:5: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
  SVM_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/SVM_PR.xlsx','UTF-8')
/tmp/ipykernel_295/4264335228.py:6: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
  SVM_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/SVM_ROC.xlsx','UTF-8')
In [ ]:
#测试决策树算法
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
DecisionTree=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10,min_samples_split=2,random_state=2024)
Research_params={'max_depth':np.arange(1,20,1,dtype=int),'min_samples_split':np.arange(1,20,1,dtype=int),
                 'criterion':['gini','entropy']}    #超参数列表
GridSearch=GridSearchCV(estimator=DecisionTree,param_grid=Research_params,cv=5,
                        scoring='accuracy',return_train_score=True,n_jobs=-1,verbose=0)
GridSearch.fit(MLRunData,ReMLLabel)
print('超参数搜索最佳超参数为;',GridSearch.best_params_)
print('超参数搜索最佳得分为:',GridSearch.best_score_)
Best_DTC=GridSearch.best_estimator_

超参数搜索最佳超参数为; {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 14}

超参数搜索最佳得分为: 0.88015873015873

In [15]:
RunMLModel(Model=Best_DTC,Data=MLRunData,Label=ReMLLabel)
模型训练集Precision:1.0,Recall:0.9649122807017544,F1_Score:0.9821428571428571,Accuracy:0.9819819819819819
模型测试集Precision:0.8913043478260869,Recall:0.9111111111111111,F1_score:0.9010989010989011,Accuracy:0.90625
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [ ]:
#计算PR曲线和ROC曲线值
DTC_PR,DTC_Accuracy=PR_Curve(Model=Best_DTC,Data=MLRunData,Label=ReMLLabel)    
DTC_ROC,DTC_AUC=ROC_Curve(Model=Best_DTC,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
DTC_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/DTC_PR.xlsx','UTF-8')
DTC_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/DTC_ROC.xlsx','UTF-8')
In [ ]:
#对随机森林进行网格搜索
from sklearn.ensemble import RandomForestClassifier
RandomForest=RandomForestClassifier(n_estimators=100,criterion='gini',max_depth=7,min_samples_split=6,
                                    bootstrap=True,min_samples_leaf=2,max_features='auto',random_state=2024)
RandomForest_params={'n_estimators':np.arange(1,40,2,dtype=int),'min_samples_leaf':np.arange(1,40,2,dtype=int),
                     'max_features':['auto','sqrt'],'criterion':['gini','entropy']}
GridSearch_RandomForest=GridSearchCV(estimator=RandomForest,param_grid=RandomForest_params,cv=5,
                        scoring='accuracy',return_train_score=False,n_jobs=-1,verbose=0)
GridSearch_RandomForest.fit(MLRunData,ReMLLabel)
print('超参数搜索最佳超参数为;',GridSearch_RandomForest.best_params_)
print('超参数搜索最佳得分为:',GridSearch_RandomForest.best_score_)
Best_Forest=GridSearch_RandomForest.best_estimator_

超参数搜索最佳超参数为; {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 29}

超参数搜索最佳得分为: 0.9526289682539681

In [18]:
#评估最佳模型
RunMLModel(Model=Best_Forest,Data=MLRunData,Label=ReMLLabel)
模型训练集Precision:0.9912280701754386,Recall:0.9912280701754386,F1_Score:0.9912280701754386,Accuracy:0.990990990990991
模型测试集Precision:0.8913043478260869,Recall:0.9111111111111111,F1_score:0.9010989010989011,Accuracy:0.90625
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [ ]:
#计算PR曲线和ROC曲线值
RondomForest_PR,RondomForest_Accuracy=PR_Curve(Model=Best_Forest,Data=MLRunData,Label=ReMLLabel)    
RondomForest_ROC,RondomForest_AUC=ROC_Curve(Model=Best_Forest,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
RondomForest_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/RondomForest_PR.xlsx','UTF-8')
RondomForest_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/RondomForest_ROC.xlsx','UTF-8')
In [20]:
#测试XGBoost模型
from xgboost import XGBClassifier
XGBoost=XGBClassifier(n_jobs=-1,verbosity=1,tree_method='auto',gpu_id=0,random_state=2025)
RunMLModel(Model=XGBoost,Data=MLRunData,Label=ReMLLabel)
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [10:06:40] WARNING: /workspace/src/common/error_msg.cc:45: `gpu_id` is deprecated since2.0.0, use `device` instead. E.g. device=cpu/cuda/cuda:0
  warnings.warn(smsg, UserWarning)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:0.875,Recall:0.9333333333333333,F1_score:0.9032258064516129,Accuracy:0.90625
-------------------测试集混淆举证-------------------
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [10:06:41] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.

  warnings.warn(smsg, UserWarning)
No description has been provided for this image
In [ ]:
#计算PR曲线和ROC曲线值
XGBoost_PR,XGBoost_Accuracy=PR_Curve(Model=XGBoost,Data=MLRunData,Label=ReMLLabel)    
XGBoost_ROC,XGBoost_AUC=ROC_Curve(Model=XGBoost,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
XGBoost_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/XGBoost_PR.xlsx','UTF-8')
XGBoost_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/XGBoost_ROC.xlsx','UTF-8')
In [22]:
#测试LightGBM算法
from lightgbm import LGBMClassifier
LightGBM=LGBMClassifier(boosting_type='gbdt',num_leaves=31,learning_rate=0.01,n_estimators=200,
                        n_jobs=-1,objective='binary',metric='binary_logloss',keep_training_booster=True,
                       importance_type='gini',random_state=2025)
RunMLModel(Model=LightGBM,Data=MLRunData,Label=ReMLLabel)
[LightGBM] [Warning] Unknown parameter: keep_training_booster
[LightGBM] [Warning] Unknown parameter: keep_training_booster
[LightGBM] [Info] Number of positive: 114, number of negative: 108
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2829
[LightGBM] [Info] Number of data points in the train set: 222, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513514 -> initscore=0.054067
[LightGBM] [Info] Start training from score 0.054067
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] Unknown parameter: keep_training_booster
模型训练集Precision:0.9911504424778761,Recall:0.9824561403508771,F1_Score:0.986784140969163,Accuracy:0.9864864864864865
[LightGBM] [Warning] Unknown parameter: keep_training_booster
模型测试集Precision:0.8604651162790697,Recall:0.8222222222222222,F1_score:0.8409090909090909,Accuracy:0.8541666666666666
-------------------测试集混淆举证-------------------
[LightGBM] [Warning] Unknown parameter: keep_training_booster
No description has been provided for this image
In [ ]:
#计算PR曲线和ROC曲线值
LightGBM_PR,LightGBM_Accuracy=PR_Curve(Model=LightGBM,Data=MLRunData,Label=ReMLLabel)    
LightGBM_ROC,LightGBM_AUC=ROC_Curve(Model=LightGBM,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
LightGBM_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/LightGBM_PR.xlsx','UTF-8')
LightGBM_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/LightGBM_ROC.xlsx','UTF-8')
In [24]:
#测试CatBoost算法
from catboost import CatBoostClassifier
CatBoost=CatBoostClassifier(learning_rate=0.01,depth=6,iterations=500,thread_count=-1)
RunMLModel(Model=CatBoost,Data=MLRunData,Label=ReMLLabel)
0:	learn: 0.6846040	total: 52ms	remaining: 25.9s
1:	learn: 0.6759460	total: 54.4ms	remaining: 13.6s
2:	learn: 0.6681809	total: 56.6ms	remaining: 9.38s
3:	learn: 0.6605169	total: 58.6ms	remaining: 7.27s
4:	learn: 0.6532148	total: 60.6ms	remaining: 6s
5:	learn: 0.6457743	total: 62.7ms	remaining: 5.16s
6:	learn: 0.6374344	total: 64.8ms	remaining: 4.56s
7:	learn: 0.6280218	total: 66.9ms	remaining: 4.11s
8:	learn: 0.6199500	total: 69.1ms	remaining: 3.77s
9:	learn: 0.6115360	total: 71.2ms	remaining: 3.49s
10:	learn: 0.6058233	total: 73.3ms	remaining: 3.26s
11:	learn: 0.5972397	total: 75.4ms	remaining: 3.06s
12:	learn: 0.5893468	total: 77.4ms	remaining: 2.9s
13:	learn: 0.5827513	total: 79.3ms	remaining: 2.75s
14:	learn: 0.5732835	total: 81.3ms	remaining: 2.63s
15:	learn: 0.5660295	total: 83.3ms	remaining: 2.52s
16:	learn: 0.5597356	total: 85.3ms	remaining: 2.42s
17:	learn: 0.5534898	total: 87.3ms	remaining: 2.34s
18:	learn: 0.5459730	total: 89.3ms	remaining: 2.26s
19:	learn: 0.5379904	total: 91.3ms	remaining: 2.19s
20:	learn: 0.5325712	total: 93.3ms	remaining: 2.13s
21:	learn: 0.5269033	total: 95.3ms	remaining: 2.07s
22:	learn: 0.5201728	total: 97.3ms	remaining: 2.02s
23:	learn: 0.5118725	total: 99.3ms	remaining: 1.97s
24:	learn: 0.5057485	total: 101ms	remaining: 1.92s
25:	learn: 0.5000583	total: 103ms	remaining: 1.88s
26:	learn: 0.4951915	total: 105ms	remaining: 1.84s
27:	learn: 0.4909327	total: 107ms	remaining: 1.81s
28:	learn: 0.4842253	total: 109ms	remaining: 1.77s
29:	learn: 0.4787442	total: 111ms	remaining: 1.74s
30:	learn: 0.4723441	total: 113ms	remaining: 1.71s
31:	learn: 0.4655272	total: 115ms	remaining: 1.69s
32:	learn: 0.4613936	total: 117ms	remaining: 1.66s
33:	learn: 0.4551019	total: 119ms	remaining: 1.64s
34:	learn: 0.4490847	total: 121ms	remaining: 1.61s
35:	learn: 0.4438207	total: 123ms	remaining: 1.59s
36:	learn: 0.4400339	total: 125ms	remaining: 1.57s
37:	learn: 0.4349726	total: 127ms	remaining: 1.55s
38:	learn: 0.4311170	total: 129ms	remaining: 1.53s
39:	learn: 0.4265656	total: 132ms	remaining: 1.51s
40:	learn: 0.4223958	total: 134ms	remaining: 1.5s
41:	learn: 0.4177472	total: 136ms	remaining: 1.48s
42:	learn: 0.4135435	total: 138ms	remaining: 1.46s
43:	learn: 0.4082540	total: 140ms	remaining: 1.45s
44:	learn: 0.4047728	total: 142ms	remaining: 1.43s
45:	learn: 0.4005827	total: 144ms	remaining: 1.42s
46:	learn: 0.3966837	total: 146ms	remaining: 1.41s
47:	learn: 0.3928355	total: 148ms	remaining: 1.4s
48:	learn: 0.3892496	total: 150ms	remaining: 1.38s
49:	learn: 0.3851300	total: 153ms	remaining: 1.37s
50:	learn: 0.3804993	total: 155ms	remaining: 1.36s
51:	learn: 0.3766210	total: 157ms	remaining: 1.35s
52:	learn: 0.3720525	total: 159ms	remaining: 1.34s
53:	learn: 0.3687800	total: 161ms	remaining: 1.33s
54:	learn: 0.3656090	total: 163ms	remaining: 1.32s
55:	learn: 0.3623371	total: 165ms	remaining: 1.31s
56:	learn: 0.3587285	total: 167ms	remaining: 1.3s
57:	learn: 0.3559665	total: 169ms	remaining: 1.29s
58:	learn: 0.3525551	total: 172ms	remaining: 1.28s
59:	learn: 0.3502077	total: 174ms	remaining: 1.27s
60:	learn: 0.3461714	total: 175ms	remaining: 1.26s
61:	learn: 0.3431533	total: 178ms	remaining: 1.25s
62:	learn: 0.3387627	total: 180ms	remaining: 1.25s
63:	learn: 0.3347702	total: 181ms	remaining: 1.24s
64:	learn: 0.3312209	total: 184ms	remaining: 1.23s
65:	learn: 0.3283847	total: 185ms	remaining: 1.22s
66:	learn: 0.3250618	total: 188ms	remaining: 1.21s
67:	learn: 0.3214170	total: 189ms	remaining: 1.2s
68:	learn: 0.3192749	total: 191ms	remaining: 1.2s
69:	learn: 0.3161495	total: 194ms	remaining: 1.19s
70:	learn: 0.3131933	total: 196ms	remaining: 1.18s
71:	learn: 0.3107692	total: 198ms	remaining: 1.18s
72:	learn: 0.3077083	total: 200ms	remaining: 1.17s
73:	learn: 0.3037429	total: 202ms	remaining: 1.16s
74:	learn: 0.3020042	total: 204ms	remaining: 1.15s
75:	learn: 0.2988086	total: 206ms	remaining: 1.15s
76:	learn: 0.2959006	total: 208ms	remaining: 1.14s
77:	learn: 0.2932654	total: 210ms	remaining: 1.13s
78:	learn: 0.2907278	total: 212ms	remaining: 1.13s
79:	learn: 0.2884350	total: 214ms	remaining: 1.12s
80:	learn: 0.2863270	total: 216ms	remaining: 1.12s
81:	learn: 0.2835952	total: 218ms	remaining: 1.11s
82:	learn: 0.2805909	total: 220ms	remaining: 1.1s
83:	learn: 0.2781724	total: 222ms	remaining: 1.1s
84:	learn: 0.2757498	total: 224ms	remaining: 1.09s
85:	learn: 0.2740257	total: 226ms	remaining: 1.09s
86:	learn: 0.2714518	total: 228ms	remaining: 1.08s
87:	learn: 0.2693197	total: 230ms	remaining: 1.08s
88:	learn: 0.2670931	total: 232ms	remaining: 1.07s
89:	learn: 0.2649242	total: 234ms	remaining: 1.07s
90:	learn: 0.2620552	total: 236ms	remaining: 1.06s
91:	learn: 0.2605563	total: 238ms	remaining: 1.06s
92:	learn: 0.2582206	total: 240ms	remaining: 1.05s
93:	learn: 0.2567592	total: 242ms	remaining: 1.05s
94:	learn: 0.2546125	total: 244ms	remaining: 1.04s
95:	learn: 0.2527879	total: 247ms	remaining: 1.04s
96:	learn: 0.2503207	total: 250ms	remaining: 1.04s
97:	learn: 0.2474673	total: 252ms	remaining: 1.03s
98:	learn: 0.2453658	total: 254ms	remaining: 1.03s
99:	learn: 0.2432069	total: 256ms	remaining: 1.02s
100:	learn: 0.2409437	total: 258ms	remaining: 1.02s
101:	learn: 0.2393565	total: 260ms	remaining: 1.01s
102:	learn: 0.2369362	total: 262ms	remaining: 1.01s
103:	learn: 0.2353270	total: 264ms	remaining: 1s
104:	learn: 0.2337941	total: 266ms	remaining: 1s
105:	learn: 0.2318099	total: 268ms	remaining: 996ms
106:	learn: 0.2298857	total: 270ms	remaining: 992ms
107:	learn: 0.2282663	total: 272ms	remaining: 988ms
108:	learn: 0.2264022	total: 274ms	remaining: 984ms
109:	learn: 0.2247401	total: 276ms	remaining: 979ms
110:	learn: 0.2229827	total: 278ms	remaining: 975ms
111:	learn: 0.2212838	total: 280ms	remaining: 970ms
112:	learn: 0.2198683	total: 282ms	remaining: 966ms
113:	learn: 0.2181729	total: 284ms	remaining: 962ms
114:	learn: 0.2162552	total: 286ms	remaining: 958ms
115:	learn: 0.2144204	total: 288ms	remaining: 954ms
116:	learn: 0.2127768	total: 290ms	remaining: 950ms
117:	learn: 0.2113537	total: 292ms	remaining: 946ms
118:	learn: 0.2099306	total: 294ms	remaining: 943ms
119:	learn: 0.2090040	total: 296ms	remaining: 939ms
120:	learn: 0.2073799	total: 298ms	remaining: 935ms
121:	learn: 0.2059616	total: 300ms	remaining: 931ms
122:	learn: 0.2044892	total: 302ms	remaining: 927ms
123:	learn: 0.2027767	total: 304ms	remaining: 923ms
124:	learn: 0.2017581	total: 306ms	remaining: 919ms
125:	learn: 0.2002303	total: 308ms	remaining: 916ms
126:	learn: 0.1982704	total: 311ms	remaining: 912ms
127:	learn: 0.1968538	total: 312ms	remaining: 908ms
128:	learn: 0.1947914	total: 315ms	remaining: 905ms
129:	learn: 0.1934204	total: 317ms	remaining: 901ms
130:	learn: 0.1919906	total: 319ms	remaining: 898ms
131:	learn: 0.1906720	total: 321ms	remaining: 894ms
132:	learn: 0.1893568	total: 323ms	remaining: 891ms
133:	learn: 0.1876434	total: 325ms	remaining: 887ms
134:	learn: 0.1864395	total: 327ms	remaining: 884ms
135:	learn: 0.1852495	total: 329ms	remaining: 880ms
136:	learn: 0.1840756	total: 331ms	remaining: 876ms
137:	learn: 0.1826665	total: 333ms	remaining: 873ms
138:	learn: 0.1810117	total: 335ms	remaining: 870ms
139:	learn: 0.1800771	total: 337ms	remaining: 866ms
140:	learn: 0.1789984	total: 339ms	remaining: 863ms
141:	learn: 0.1774704	total: 341ms	remaining: 859ms
142:	learn: 0.1761096	total: 343ms	remaining: 856ms
143:	learn: 0.1743690	total: 345ms	remaining: 853ms
144:	learn: 0.1732425	total: 347ms	remaining: 850ms
145:	learn: 0.1723546	total: 349ms	remaining: 847ms
146:	learn: 0.1710597	total: 351ms	remaining: 843ms
147:	learn: 0.1700890	total: 353ms	remaining: 840ms
148:	learn: 0.1689187	total: 355ms	remaining: 837ms
149:	learn: 0.1675242	total: 357ms	remaining: 834ms
150:	learn: 0.1663232	total: 359ms	remaining: 830ms
151:	learn: 0.1650952	total: 361ms	remaining: 827ms
152:	learn: 0.1638753	total: 363ms	remaining: 824ms
153:	learn: 0.1625407	total: 365ms	remaining: 821ms
154:	learn: 0.1611600	total: 367ms	remaining: 817ms
155:	learn: 0.1604441	total: 369ms	remaining: 814ms
156:	learn: 0.1594782	total: 371ms	remaining: 811ms
157:	learn: 0.1581012	total: 373ms	remaining: 808ms
158:	learn: 0.1568787	total: 375ms	remaining: 805ms
159:	learn: 0.1557659	total: 377ms	remaining: 802ms
160:	learn: 0.1550170	total: 379ms	remaining: 799ms
161:	learn: 0.1537192	total: 382ms	remaining: 796ms
162:	learn: 0.1530482	total: 384ms	remaining: 793ms
163:	learn: 0.1521747	total: 386ms	remaining: 790ms
164:	learn: 0.1512109	total: 388ms	remaining: 787ms
165:	learn: 0.1502230	total: 390ms	remaining: 784ms
166:	learn: 0.1492235	total: 392ms	remaining: 781ms
167:	learn: 0.1483271	total: 394ms	remaining: 778ms
168:	learn: 0.1472658	total: 396ms	remaining: 775ms
169:	learn: 0.1463962	total: 398ms	remaining: 772ms
170:	learn: 0.1452737	total: 400ms	remaining: 769ms
171:	learn: 0.1445030	total: 402ms	remaining: 766ms
172:	learn: 0.1436421	total: 404ms	remaining: 763ms
173:	learn: 0.1426043	total: 406ms	remaining: 760ms
174:	learn: 0.1417990	total: 408ms	remaining: 757ms
175:	learn: 0.1409964	total: 410ms	remaining: 754ms
176:	learn: 0.1400960	total: 412ms	remaining: 751ms
177:	learn: 0.1392026	total: 414ms	remaining: 748ms
178:	learn: 0.1382591	total: 416ms	remaining: 745ms
179:	learn: 0.1375002	total: 418ms	remaining: 743ms
180:	learn: 0.1365168	total: 420ms	remaining: 740ms
181:	learn: 0.1355343	total: 422ms	remaining: 737ms
182:	learn: 0.1346431	total: 424ms	remaining: 734ms
183:	learn: 0.1339091	total: 426ms	remaining: 731ms
184:	learn: 0.1330647	total: 428ms	remaining: 729ms
185:	learn: 0.1323216	total: 430ms	remaining: 726ms
186:	learn: 0.1315324	total: 432ms	remaining: 723ms
187:	learn: 0.1307566	total: 434ms	remaining: 720ms
188:	learn: 0.1299676	total: 436ms	remaining: 717ms
189:	learn: 0.1292623	total: 438ms	remaining: 715ms
190:	learn: 0.1284551	total: 440ms	remaining: 712ms
191:	learn: 0.1276340	total: 442ms	remaining: 709ms
192:	learn: 0.1271059	total: 444ms	remaining: 707ms
193:	learn: 0.1264945	total: 447ms	remaining: 705ms
194:	learn: 0.1258445	total: 449ms	remaining: 702ms
195:	learn: 0.1249638	total: 451ms	remaining: 699ms
196:	learn: 0.1241766	total: 453ms	remaining: 697ms
197:	learn: 0.1235573	total: 455ms	remaining: 694ms
198:	learn: 0.1229102	total: 457ms	remaining: 691ms
199:	learn: 0.1223410	total: 459ms	remaining: 688ms
200:	learn: 0.1216837	total: 461ms	remaining: 686ms
201:	learn: 0.1210594	total: 463ms	remaining: 683ms
202:	learn: 0.1205716	total: 465ms	remaining: 681ms
203:	learn: 0.1201110	total: 467ms	remaining: 678ms
204:	learn: 0.1195497	total: 469ms	remaining: 675ms
205:	learn: 0.1189083	total: 471ms	remaining: 673ms
206:	learn: 0.1182833	total: 473ms	remaining: 670ms
207:	learn: 0.1174048	total: 475ms	remaining: 667ms
208:	learn: 0.1167230	total: 477ms	remaining: 665ms
209:	learn: 0.1161288	total: 480ms	remaining: 662ms
210:	learn: 0.1154258	total: 482ms	remaining: 660ms
211:	learn: 0.1148013	total: 484ms	remaining: 657ms
212:	learn: 0.1143376	total: 485ms	remaining: 654ms
213:	learn: 0.1134639	total: 487ms	remaining: 651ms
214:	learn: 0.1127821	total: 490ms	remaining: 649ms
215:	learn: 0.1122507	total: 492ms	remaining: 646ms
216:	learn: 0.1113537	total: 494ms	remaining: 644ms
217:	learn: 0.1106922	total: 496ms	remaining: 641ms
218:	learn: 0.1100270	total: 498ms	remaining: 639ms
219:	learn: 0.1095422	total: 500ms	remaining: 636ms
220:	learn: 0.1090378	total: 502ms	remaining: 634ms
221:	learn: 0.1083041	total: 504ms	remaining: 631ms
222:	learn: 0.1079058	total: 506ms	remaining: 628ms
223:	learn: 0.1074226	total: 508ms	remaining: 626ms
224:	learn: 0.1065478	total: 510ms	remaining: 623ms
225:	learn: 0.1060412	total: 512ms	remaining: 620ms
226:	learn: 0.1055783	total: 514ms	remaining: 618ms
227:	learn: 0.1048917	total: 516ms	remaining: 616ms
228:	learn: 0.1042313	total: 518ms	remaining: 613ms
229:	learn: 0.1036835	total: 520ms	remaining: 611ms
230:	learn: 0.1031781	total: 522ms	remaining: 608ms
231:	learn: 0.1026748	total: 524ms	remaining: 606ms
232:	learn: 0.1021898	total: 526ms	remaining: 603ms
233:	learn: 0.1018561	total: 528ms	remaining: 600ms
234:	learn: 0.1014298	total: 530ms	remaining: 598ms
235:	learn: 0.1008389	total: 532ms	remaining: 595ms
236:	learn: 0.1002588	total: 534ms	remaining: 593ms
237:	learn: 0.0998770	total: 536ms	remaining: 590ms
238:	learn: 0.0992447	total: 538ms	remaining: 588ms
239:	learn: 0.0985791	total: 540ms	remaining: 585ms
240:	learn: 0.0980068	total: 542ms	remaining: 583ms
241:	learn: 0.0975926	total: 544ms	remaining: 580ms
242:	learn: 0.0971103	total: 546ms	remaining: 578ms
243:	learn: 0.0965935	total: 548ms	remaining: 575ms
244:	learn: 0.0959998	total: 550ms	remaining: 573ms
245:	learn: 0.0953505	total: 552ms	remaining: 570ms
246:	learn: 0.0949572	total: 554ms	remaining: 568ms
247:	learn: 0.0944275	total: 556ms	remaining: 565ms
248:	learn: 0.0939614	total: 558ms	remaining: 563ms
249:	learn: 0.0934980	total: 560ms	remaining: 560ms
250:	learn: 0.0929929	total: 562ms	remaining: 558ms
251:	learn: 0.0926004	total: 564ms	remaining: 555ms
252:	learn: 0.0922454	total: 566ms	remaining: 553ms
253:	learn: 0.0916853	total: 568ms	remaining: 550ms
254:	learn: 0.0913217	total: 571ms	remaining: 548ms
255:	learn: 0.0909009	total: 573ms	remaining: 546ms
256:	learn: 0.0904650	total: 575ms	remaining: 543ms
257:	learn: 0.0900383	total: 577ms	remaining: 541ms
258:	learn: 0.0895855	total: 579ms	remaining: 538ms
259:	learn: 0.0892367	total: 581ms	remaining: 536ms
260:	learn: 0.0889366	total: 583ms	remaining: 534ms
261:	learn: 0.0886641	total: 585ms	remaining: 531ms
262:	learn: 0.0882293	total: 587ms	remaining: 529ms
263:	learn: 0.0877998	total: 589ms	remaining: 526ms
264:	learn: 0.0872884	total: 591ms	remaining: 524ms
265:	learn: 0.0867746	total: 593ms	remaining: 521ms
266:	learn: 0.0862749	total: 595ms	remaining: 519ms
267:	learn: 0.0859732	total: 597ms	remaining: 517ms
268:	learn: 0.0854699	total: 599ms	remaining: 514ms
269:	learn: 0.0851456	total: 601ms	remaining: 512ms
270:	learn: 0.0848958	total: 603ms	remaining: 509ms
271:	learn: 0.0845757	total: 605ms	remaining: 507ms
272:	learn: 0.0841830	total: 607ms	remaining: 505ms
273:	learn: 0.0836906	total: 609ms	remaining: 502ms
274:	learn: 0.0833261	total: 611ms	remaining: 500ms
275:	learn: 0.0827048	total: 613ms	remaining: 498ms
276:	learn: 0.0823247	total: 615ms	remaining: 495ms
277:	learn: 0.0819396	total: 617ms	remaining: 493ms
278:	learn: 0.0816164	total: 619ms	remaining: 490ms
279:	learn: 0.0812882	total: 621ms	remaining: 488ms
280:	learn: 0.0809456	total: 623ms	remaining: 486ms
281:	learn: 0.0805355	total: 625ms	remaining: 483ms
282:	learn: 0.0800955	total: 627ms	remaining: 481ms
283:	learn: 0.0796182	total: 630ms	remaining: 479ms
284:	learn: 0.0793585	total: 632ms	remaining: 476ms
285:	learn: 0.0790722	total: 634ms	remaining: 474ms
286:	learn: 0.0786436	total: 636ms	remaining: 472ms
287:	learn: 0.0782855	total: 638ms	remaining: 470ms
288:	learn: 0.0777196	total: 640ms	remaining: 468ms
289:	learn: 0.0774455	total: 643ms	remaining: 465ms
290:	learn: 0.0771741	total: 645ms	remaining: 463ms
291:	learn: 0.0767213	total: 647ms	remaining: 461ms
292:	learn: 0.0763101	total: 649ms	remaining: 458ms
293:	learn: 0.0759696	total: 651ms	remaining: 456ms
294:	learn: 0.0756114	total: 653ms	remaining: 454ms
295:	learn: 0.0752771	total: 655ms	remaining: 451ms
296:	learn: 0.0749658	total: 657ms	remaining: 449ms
297:	learn: 0.0745448	total: 659ms	remaining: 447ms
298:	learn: 0.0742472	total: 661ms	remaining: 444ms
299:	learn: 0.0739642	total: 663ms	remaining: 442ms
300:	learn: 0.0736538	total: 665ms	remaining: 440ms
301:	learn: 0.0731854	total: 667ms	remaining: 437ms
302:	learn: 0.0729014	total: 669ms	remaining: 435ms
303:	learn: 0.0726266	total: 671ms	remaining: 432ms
304:	learn: 0.0723980	total: 673ms	remaining: 430ms
305:	learn: 0.0720825	total: 675ms	remaining: 428ms
306:	learn: 0.0717842	total: 677ms	remaining: 425ms
307:	learn: 0.0715165	total: 679ms	remaining: 423ms
308:	learn: 0.0712906	total: 681ms	remaining: 421ms
309:	learn: 0.0709843	total: 683ms	remaining: 418ms
310:	learn: 0.0706904	total: 685ms	remaining: 416ms
311:	learn: 0.0703273	total: 687ms	remaining: 414ms
312:	learn: 0.0699863	total: 689ms	remaining: 411ms
313:	learn: 0.0697272	total: 691ms	remaining: 409ms
314:	learn: 0.0693374	total: 693ms	remaining: 407ms
315:	learn: 0.0690057	total: 695ms	remaining: 405ms
316:	learn: 0.0687646	total: 697ms	remaining: 402ms
317:	learn: 0.0684913	total: 699ms	remaining: 400ms
318:	learn: 0.0682175	total: 701ms	remaining: 398ms
319:	learn: 0.0679486	total: 703ms	remaining: 395ms
320:	learn: 0.0675699	total: 705ms	remaining: 393ms
321:	learn: 0.0673991	total: 707ms	remaining: 391ms
322:	learn: 0.0671463	total: 709ms	remaining: 389ms
323:	learn: 0.0668977	total: 711ms	remaining: 386ms
324:	learn: 0.0665579	total: 713ms	remaining: 384ms
325:	learn: 0.0663293	total: 715ms	remaining: 382ms
326:	learn: 0.0660506	total: 718ms	remaining: 380ms
327:	learn: 0.0658147	total: 720ms	remaining: 377ms
328:	learn: 0.0655132	total: 722ms	remaining: 375ms
329:	learn: 0.0652340	total: 724ms	remaining: 373ms
330:	learn: 0.0649374	total: 726ms	remaining: 371ms
331:	learn: 0.0647005	total: 728ms	remaining: 368ms
332:	learn: 0.0644140	total: 730ms	remaining: 366ms
333:	learn: 0.0641768	total: 733ms	remaining: 364ms
334:	learn: 0.0638923	total: 735ms	remaining: 362ms
335:	learn: 0.0635807	total: 737ms	remaining: 360ms
336:	learn: 0.0633689	total: 739ms	remaining: 357ms
337:	learn: 0.0630989	total: 741ms	remaining: 355ms
338:	learn: 0.0628456	total: 743ms	remaining: 353ms
339:	learn: 0.0625842	total: 745ms	remaining: 351ms
340:	learn: 0.0622742	total: 747ms	remaining: 348ms
341:	learn: 0.0620714	total: 749ms	remaining: 346ms
342:	learn: 0.0617471	total: 751ms	remaining: 344ms
343:	learn: 0.0615121	total: 753ms	remaining: 341ms
344:	learn: 0.0612585	total: 755ms	remaining: 339ms
345:	learn: 0.0610550	total: 757ms	remaining: 337ms
346:	learn: 0.0608039	total: 759ms	remaining: 335ms
347:	learn: 0.0604924	total: 761ms	remaining: 332ms
348:	learn: 0.0601795	total: 763ms	remaining: 330ms
349:	learn: 0.0598725	total: 765ms	remaining: 328ms
350:	learn: 0.0596511	total: 767ms	remaining: 326ms
351:	learn: 0.0594457	total: 769ms	remaining: 323ms
352:	learn: 0.0592154	total: 771ms	remaining: 321ms
353:	learn: 0.0589629	total: 773ms	remaining: 319ms
354:	learn: 0.0586987	total: 775ms	remaining: 317ms
355:	learn: 0.0584759	total: 777ms	remaining: 314ms
356:	learn: 0.0582162	total: 779ms	remaining: 312ms
357:	learn: 0.0579987	total: 781ms	remaining: 310ms
358:	learn: 0.0576667	total: 783ms	remaining: 308ms
359:	learn: 0.0574809	total: 785ms	remaining: 305ms
360:	learn: 0.0573083	total: 787ms	remaining: 303ms
361:	learn: 0.0571192	total: 789ms	remaining: 301ms
362:	learn: 0.0569243	total: 791ms	remaining: 299ms
363:	learn: 0.0567261	total: 793ms	remaining: 296ms
364:	learn: 0.0564865	total: 795ms	remaining: 294ms
365:	learn: 0.0562840	total: 797ms	remaining: 292ms
366:	learn: 0.0560785	total: 799ms	remaining: 290ms
367:	learn: 0.0558322	total: 802ms	remaining: 288ms
368:	learn: 0.0556182	total: 804ms	remaining: 285ms
369:	learn: 0.0554483	total: 806ms	remaining: 283ms
370:	learn: 0.0553145	total: 808ms	remaining: 281ms
371:	learn: 0.0551290	total: 810ms	remaining: 279ms
372:	learn: 0.0548128	total: 812ms	remaining: 276ms
373:	learn: 0.0546286	total: 814ms	remaining: 274ms
374:	learn: 0.0544126	total: 816ms	remaining: 272ms
375:	learn: 0.0541702	total: 818ms	remaining: 270ms
376:	learn: 0.0540095	total: 820ms	remaining: 268ms
377:	learn: 0.0537582	total: 822ms	remaining: 265ms
378:	learn: 0.0535366	total: 825ms	remaining: 263ms
379:	learn: 0.0532767	total: 827ms	remaining: 261ms
380:	learn: 0.0530737	total: 830ms	remaining: 259ms
381:	learn: 0.0529433	total: 832ms	remaining: 257ms
382:	learn: 0.0526476	total: 835ms	remaining: 255ms
383:	learn: 0.0524374	total: 837ms	remaining: 253ms
384:	learn: 0.0522682	total: 839ms	remaining: 251ms
385:	learn: 0.0521002	total: 842ms	remaining: 249ms
386:	learn: 0.0517807	total: 845ms	remaining: 247ms
387:	learn: 0.0516491	total: 847ms	remaining: 244ms
388:	learn: 0.0513977	total: 849ms	remaining: 242ms
389:	learn: 0.0511981	total: 851ms	remaining: 240ms
390:	learn: 0.0509928	total: 853ms	remaining: 238ms
391:	learn: 0.0508100	total: 855ms	remaining: 235ms
392:	learn: 0.0506515	total: 857ms	remaining: 233ms
393:	learn: 0.0504879	total: 859ms	remaining: 231ms
394:	learn: 0.0503601	total: 861ms	remaining: 229ms
395:	learn: 0.0501811	total: 863ms	remaining: 227ms
396:	learn: 0.0499619	total: 865ms	remaining: 224ms
397:	learn: 0.0497344	total: 867ms	remaining: 222ms
398:	learn: 0.0494894	total: 869ms	remaining: 220ms
399:	learn: 0.0492946	total: 871ms	remaining: 218ms
400:	learn: 0.0491168	total: 873ms	remaining: 215ms
401:	learn: 0.0489381	total: 875ms	remaining: 213ms
402:	learn: 0.0487662	total: 877ms	remaining: 211ms
403:	learn: 0.0485960	total: 879ms	remaining: 209ms
404:	learn: 0.0484260	total: 881ms	remaining: 207ms
405:	learn: 0.0482357	total: 883ms	remaining: 204ms
406:	learn: 0.0480300	total: 885ms	remaining: 202ms
407:	learn: 0.0478388	total: 887ms	remaining: 200ms
408:	learn: 0.0477056	total: 889ms	remaining: 198ms
409:	learn: 0.0475431	total: 891ms	remaining: 196ms
410:	learn: 0.0473370	total: 893ms	remaining: 193ms
411:	learn: 0.0471485	total: 895ms	remaining: 191ms
412:	learn: 0.0470046	total: 897ms	remaining: 189ms
413:	learn: 0.0467841	total: 899ms	remaining: 187ms
414:	learn: 0.0466280	total: 901ms	remaining: 185ms
415:	learn: 0.0464732	total: 903ms	remaining: 182ms
416:	learn: 0.0462847	total: 905ms	remaining: 180ms
417:	learn: 0.0461002	total: 907ms	remaining: 178ms
418:	learn: 0.0459447	total: 909ms	remaining: 176ms
419:	learn: 0.0458015	total: 911ms	remaining: 174ms
420:	learn: 0.0456173	total: 913ms	remaining: 171ms
421:	learn: 0.0454063	total: 915ms	remaining: 169ms
422:	learn: 0.0452413	total: 917ms	remaining: 167ms
423:	learn: 0.0451065	total: 919ms	remaining: 165ms
424:	learn: 0.0449328	total: 921ms	remaining: 163ms
425:	learn: 0.0447209	total: 923ms	remaining: 160ms
426:	learn: 0.0445578	total: 925ms	remaining: 158ms
427:	learn: 0.0443970	total: 927ms	remaining: 156ms
428:	learn: 0.0441543	total: 929ms	remaining: 154ms
429:	learn: 0.0440567	total: 931ms	remaining: 152ms
430:	learn: 0.0438545	total: 933ms	remaining: 149ms
431:	learn: 0.0437481	total: 935ms	remaining: 147ms
432:	learn: 0.0435939	total: 937ms	remaining: 145ms
433:	learn: 0.0434315	total: 939ms	remaining: 143ms
434:	learn: 0.0432680	total: 941ms	remaining: 141ms
435:	learn: 0.0431179	total: 944ms	remaining: 138ms
436:	learn: 0.0429558	total: 946ms	remaining: 136ms
437:	learn: 0.0427794	total: 948ms	remaining: 134ms
438:	learn: 0.0426621	total: 950ms	remaining: 132ms
439:	learn: 0.0424642	total: 952ms	remaining: 130ms
440:	learn: 0.0422749	total: 954ms	remaining: 128ms
441:	learn: 0.0420968	total: 956ms	remaining: 125ms
442:	learn: 0.0419342	total: 958ms	remaining: 123ms
443:	learn: 0.0417983	total: 960ms	remaining: 121ms
444:	learn: 0.0416726	total: 962ms	remaining: 119ms
445:	learn: 0.0414811	total: 964ms	remaining: 117ms
446:	learn: 0.0413471	total: 966ms	remaining: 115ms
447:	learn: 0.0412036	total: 968ms	remaining: 112ms
448:	learn: 0.0410487	total: 970ms	remaining: 110ms
449:	learn: 0.0408936	total: 972ms	remaining: 108ms
450:	learn: 0.0407416	total: 974ms	remaining: 106ms
451:	learn: 0.0405291	total: 976ms	remaining: 104ms
452:	learn: 0.0404189	total: 978ms	remaining: 102ms
453:	learn: 0.0402506	total: 980ms	remaining: 99.3ms
454:	learn: 0.0401505	total: 982ms	remaining: 97.1ms
455:	learn: 0.0399942	total: 984ms	remaining: 95ms
456:	learn: 0.0398390	total: 986ms	remaining: 92.8ms
457:	learn: 0.0396801	total: 988ms	remaining: 90.6ms
458:	learn: 0.0394797	total: 990ms	remaining: 88.5ms
459:	learn: 0.0393357	total: 992ms	remaining: 86.3ms
460:	learn: 0.0392137	total: 994ms	remaining: 84.1ms
461:	learn: 0.0390646	total: 996ms	remaining: 82ms
462:	learn: 0.0389406	total: 998ms	remaining: 79.8ms
463:	learn: 0.0388542	total: 1s	remaining: 77.6ms
464:	learn: 0.0387217	total: 1s	remaining: 75.4ms
465:	learn: 0.0386027	total: 1s	remaining: 73.3ms
466:	learn: 0.0384044	total: 1.01s	remaining: 71.1ms
467:	learn: 0.0382599	total: 1.01s	remaining: 69ms
468:	learn: 0.0380873	total: 1.01s	remaining: 66.8ms
469:	learn: 0.0379271	total: 1.01s	remaining: 64.6ms
470:	learn: 0.0377469	total: 1.01s	remaining: 62.5ms
471:	learn: 0.0376363	total: 1.02s	remaining: 60.3ms
472:	learn: 0.0374950	total: 1.02s	remaining: 58.1ms
473:	learn: 0.0373768	total: 1.02s	remaining: 56ms
474:	learn: 0.0372549	total: 1.02s	remaining: 53.8ms
475:	learn: 0.0370989	total: 1.02s	remaining: 51.7ms
476:	learn: 0.0369066	total: 1.03s	remaining: 49.5ms
477:	learn: 0.0367730	total: 1.03s	remaining: 47.3ms
478:	learn: 0.0366318	total: 1.03s	remaining: 45.2ms
479:	learn: 0.0364936	total: 1.03s	remaining: 43ms
480:	learn: 0.0363073	total: 1.03s	remaining: 40.9ms
481:	learn: 0.0361262	total: 1.04s	remaining: 38.7ms
482:	learn: 0.0359913	total: 1.04s	remaining: 36.6ms
483:	learn: 0.0358801	total: 1.04s	remaining: 34.4ms
484:	learn: 0.0357930	total: 1.04s	remaining: 32.3ms
485:	learn: 0.0356674	total: 1.04s	remaining: 30.1ms
486:	learn: 0.0355692	total: 1.05s	remaining: 28ms
487:	learn: 0.0354164	total: 1.05s	remaining: 25.8ms
488:	learn: 0.0352671	total: 1.05s	remaining: 23.7ms
489:	learn: 0.0351130	total: 1.05s	remaining: 21.5ms
490:	learn: 0.0349866	total: 1.05s	remaining: 19.3ms
491:	learn: 0.0348900	total: 1.06s	remaining: 17.2ms
492:	learn: 0.0347980	total: 1.06s	remaining: 15ms
493:	learn: 0.0347149	total: 1.06s	remaining: 12.9ms
494:	learn: 0.0345927	total: 1.06s	remaining: 10.7ms
495:	learn: 0.0344583	total: 1.06s	remaining: 8.59ms
496:	learn: 0.0343711	total: 1.07s	remaining: 6.44ms
497:	learn: 0.0342721	total: 1.07s	remaining: 4.29ms
498:	learn: 0.0341411	total: 1.07s	remaining: 2.15ms
499:	learn: 0.0340330	total: 1.07s	remaining: 0us
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0
模型测试集Precision:0.9111111111111111,Recall:0.9111111111111111,F1_score:0.9111111111111111,Accuracy:0.9166666666666666
-------------------测试集混淆举证-------------------
No description has been provided for this image
In [ ]:
#计算PR曲线和ROC曲线值
CatBoost_PR,CatBoost_Accuracy=PR_Curve(Model=CatBoost,Data=MLRunData,Label=ReMLLabel)    
CatBoost_ROC,CatBoost_AUC=ROC_Curve(Model=CatBoost,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
CatBoost_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/CatBoost_PR.xlsx','UTF-8')
CatBoost_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/CatBoost_ROC.xlsx','UTF-8')
In [26]:
#绘制PR曲线
import matplotlib.pyplot as plt
PR_curve=plt.figure(dpi=300)
PR_ax=PR_curve.add_subplot(111)
PR_ax.set_title('Precision-Recall curve')
PR_ax.plot(CatBoost_PR['recall'], CatBoost_PR['precision'],color='red',label='CatBoost=%f'%CatBoost_Accuracy)
PR_ax.plot(DTC_PR['recall'], DTC_PR['precision'],color='green',label='DecisionTree=%f'%DTC_Accuracy)
PR_ax.plot(LightGBM_PR['recall'], LightGBM_PR['precision'],color='blue',label='LightGBM=%f'%LightGBM_Accuracy)
PR_ax.plot(Logistic_PR['recall'], Logistic_PR['precision'],color='yellow',label='Logistic=%f'%Logistic_Accuracy)
PR_ax.plot(RondomForest_PR['recall'], RondomForest_PR['precision'],color='purple',label='RondomForest=%f'%RondomForest_Accuracy)
PR_ax.plot(SVM_PR['recall'], SVM_PR['precision'],color='cyan',label='SVM=%f'%SVM_Accuracy)
PR_ax.plot(XGBoost_PR['recall'], XGBoost_PR['precision'],color='pink',label='XGBoost=%f'%XGBoost_Accuracy)
PR_ax.plot([0,1],[1,0],linestyle='-.',color='black')
PR_ax.set_xlabel('Recall')  
PR_ax.set_ylabel('Precision')    
plt.legend(loc="best")  
plt.show()
No description has been provided for this image
In [27]:
#绘制ROC曲线
import matplotlib.pyplot as plt
ROC_curve=plt.figure(dpi=300)
ROC_ax=ROC_curve.add_subplot(111)
ROC_ax.set_title('ROC Curve')
ROC_ax.plot(CatBoost_ROC['fpr'], CatBoost_ROC['tpr'],color='red',label='CatBoost=%f'%CatBoost_AUC)
ROC_ax.plot(DTC_ROC['fpr'], DTC_ROC['tpr'],color='green',label='DecisionTree=%f'%DTC_AUC)
ROC_ax.plot(LightGBM_ROC['fpr'], LightGBM_ROC['tpr'],color='blue',label='LightGBM=%f'%LightGBM_AUC)
ROC_ax.plot(Logistic_ROC['fpr'], Logistic_ROC['tpr'],color='yellow',label='Logistic=%f'%Logistic_AUC)
ROC_ax.plot(RondomForest_ROC['fpr'], RondomForest_ROC['tpr'],color='purple',label='RondomForest=%f'%RondomForest_AUC)
ROC_ax.plot(SVM_ROC['fpr'], SVM_ROC['tpr'],color='cyan',label='SVM=%f'%SVM_AUC)
ROC_ax.plot(XGBoost_ROC['fpr'], XGBoost_ROC['tpr'],color='pink',label='XGBoost=%f'%XGBoost_AUC)
ROC_ax.plot([0,1],[0,1],linestyle='-.',color='black')
ROC_ax.set_xlim([-0.05, 1.0])  
ROC_ax.set_ylim([0, 1.05])  
ROC_ax.set_xlabel('FPR')  
ROC_ax.set_ylabel('TPR')    
plt.legend(loc="best")  
plt.show()
No description has been provided for this image
In [28]:
#保存模型-输入[batchs,61],输出[batch,1]
import joblib
MLModel=[Logistic,SVM_Classifier,Best_DTC,Best_Forest,XGBoost,LightGBM,CatBoost]
MLModelStr=['Logistic.pkl','SVM.pkl','DecisionTree.pkl','Forest.pkl','XGBoost.pkl','LightGBM.pkl','CatBoost.pkl']
try:
    for model,modelstr in zip(MLModel,MLModelStr):
        joblib.dump(model,'/mnt/workspace/Analysis Model/Meachine Learning Models/'+modelstr)
    print('模型保存成功!')
except:
    print('模型保存异常!!!')
模型保存成功!
In [29]:
#执行PCA算法-计算保留95%信息时的维度
from sklearn.decomposition import PCA
PCA=PCA(n_components=0.95)
Methylation_95=PCA.fit_transform(MLRunData)    #数据降维
Methylation_95.shape
Out[29]:
(318, 22)
In [31]:
#构建SHAP库解析数据集
from sklearn.model_selection import train_test_split
TrainDataSHAP,TestDataSHAP,TrainLabelSHAP,TestLabelSHAP=train_test_split(MLRunData,ReMLLabel,train_size=0.7,random_state=2025)  
In [32]:
#载入SHAP库计算SHAP值
import shap
SVMExplainer=shap.KernelExplainer(SVM_Classifier.predict, TestDataSHAP)
shap_values=SVMExplainer.shap_values(TestDataSHAP)
100%|██████████| 96/96 [06:58<00:00,  4.36s/it]
In [33]:
#绘制Summaryplot图
shap.summary_plot(shap_values,TestDataSHAP,max_display=22)
shap.summary_plot(shap_values,TestDataSHAP,max_display=22,plot_type='bar')
No description has been provided for this image
No description has been provided for this image
In [34]:
#绘制SHAP图
SHAP_ResFormer=(TestDataSHAP)
100%|██████████| 96/96 [07:01<00:00,  4.39s/it]
In [35]:
shap.plots.heatmap(SHAP_SVM,max_display=23)
No description has been provided for this image
Out[35]:
<Axes: xlabel='Instances'>
In [36]:
#绘制Decision图
expected_value=SVMExplainer.expected_value
shap.decision_plot(expected_value,shap_values,TestDataSHAP.columns,feature_display_range=slice(None, -23, -1))
No description has been provided for this image
In [37]:
#绘制SHAP散点图
shap.plots.scatter(SHAP_SVM[:,'cg00581848'])
shap.plots.scatter(SHAP_SVM[:,'cg00581848'],color=SHAP_SVM)
No description has been provided for this image
No description has been provided for this image
In [38]:
#绘制SHAP散点图-性别特异性
shap.plots.scatter(SHAP_SVM[:,'GenderEncoder'])
shap.plots.scatter(SHAP_SVM[:,'GenderEncoder'],color=SHAP_SVM)
No description has been provided for this image
No description has been provided for this image
In [ ]:
#输出SHAP值
SHAPData=DataFrame(shap_values)
SHAPData.to_excel('/mnt/workspace/Analysis Data/SVM_SHAP_Data.xlsx','UFT-8')
SHAPData.to_csv('/mnt/workspace/Analysis Data/SVM_SHAP_Data.csv')

使用LIME(Local Interpretable Model-agnostic Explanations)对SVM模型进行模型解释性。

In [ ]:
#转化为Numpy数组
import numpy as np
TestDataLIME=np.array(TestDataSHAP)
TestDataLIME
In [103]:
#构建LIME解释器
from lime.lime_tabular import LimeTabularExplainer
SVM_LIMEExplainer=LimeTabularExplainer(training_data=TestDataLIME,mode='classification',feature_names=TestDataSHAP.columns,class_names=[0,1], discretize_continuous=True)
In [104]:
#提取类风湿性关节炎患者和健康人数据——为数据生成解释
LIMEData=pd.concat([TestDataSHAP,TestLabelSHAP],axis=1)    #合并数据
Contral_Data=LIMEData.loc[LIMEData.loc[:,'DiseaseEncoder']==0,:]    #正常组数据
Contral_DataSample=Contral_Data.iloc[0,:-1]
RA_Data=LIMEData.loc[LIMEData.loc[:,'DiseaseEncoder']==1,:]    #正常组数据
RA_DataSample=RA_Data.iloc[0,:-1]
In [105]:
#为健康人生成可解释性
Contral_Explain=SVM_LIMEExplainer.explain_instance(Contral_DataSample,SVM_Classifier.predict_proba,num_features=22)
#可视化解释结果
Contral_Explain.show_in_notebook(show_table=True, show_all=False)
Contral_Explain.as_pyplot_figure()
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:544: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  binary_column = (inverse_column == first_row[column]).astype(int)
/usr/local/lib/python3.11/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but SVC was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:427: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  discretized_instance[f])]
Out[105]:
No description has been provided for this image
No description has been provided for this image
In [106]:
#为类风湿性关节炎患者生成可解释性
RA_Explain=SVM_LIMEExplainer.explain_instance(RA_DataSample,SVM_Classifier.predict_proba,num_features=22)
#可视化解释结果
RA_Explain.show_in_notebook(show_table=True, show_all=False)
RA_Explain.as_pyplot_figure()
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:544: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  binary_column = (inverse_column == first_row[column]).astype(int)
/usr/local/lib/python3.11/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but SVC was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`
  ret[feature] = int(self.lambdas[feature](ret[feature]))
/usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:427: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  discretized_instance[f])]
Out[106]:
No description has been provided for this image
No description has been provided for this image

提取SHAP方法和LIME方法提取的特征数据做交集获得候选特征。

In [111]:
#获取交 集数据
SHAP_Feature=['cg00581848','cg05544622','cg00478198','cg01938887','cg17482649','cg00455876','cg11704979','cg01515508',
             'cg05257372','cg22221554','cg00423014','cg22561883','cg10315562','cg26012731','cg03601619','cg26039926',
             'cg02714462','cg08141049','cg05443523','cg19532714','age','cg07066594']      #SHAP框架提取特征
LIME_Feature=['cg07066594','cg00581848','cg17482649','cg00455876','cg05544622','cg00423014','cg01515508','cg05443523',
             'cg08141049','cg26039926','cg00342358','cg23925558','cg16078210','cg00306390','cg00776430','cg01938887',
             'cg00543485','cg22561883','cg02896361','cg17488844','cg05257372','cg12944030']    #LIME提取特征
MethylationRA=set(SHAP_Feature)&set(LIME_Feature)    #计算交集
MethylationRA=list(MethylationRA)+['age','gender','disease','GenderEncoder','DiseaseEncoder']
MethylationRA
Out[111]:
['cg05544622',
 'cg01938887',
 'cg07066594',
 'cg22561883',
 'cg00581848',
 'cg05443523',
 'cg05257372',
 'cg00455876',
 'cg17482649',
 'cg26039926',
 'cg00423014',
 'cg01515508',
 'cg08141049',
 'age',
 'gender',
 'disease',
 'GenderEncoder',
 'DiseaseEncoder']

对交集DNA甲基化位点数据进行统计学分析,主要包括:正态检验、差异分析和相关分析。

In [112]:
#获取统计学数据
StatsData=MethylationData.loc[:,MethylationRA]
StatsData.shape
Out[112]:
(318, 18)
In [136]:
#首先进行正态性分析
from scipy.stats import normaltest
CGList=['cg05544622','cg01938887','cg07066594','cg22561883','cg00581848','cg05443523','cg05257372','cg00455876',
                         'cg17482649','cg26039926','cg00423014','cg01515508','cg08141049','age']
NormalCG,NormalStats,NormalPvalue=[],[],[]
for cg in CGList:
    CGData=StatsData.loc[:,cg]    #提取数据
    normalstats,normalpvalue=normaltest(CGData)    #正态检验
    NormalCG.append(cg)
    NormalStats.append(normalstats)
    NormalPvalue.append(normalpvalue)
NormalResult=DataFrame()
NormalResult['Feature']=NormalCG
NormalResult['Normal Stats']=NormalStats
NormalResult['Normal P value']=NormalPvalue
NormalResult=NormalResult.sort_values('Normal P value',ascending=True)
NormalResult
Out[136]:
Feature Normal Stats Normal P value
3 cg22561883 45397.441686 0.000000e+00
0 cg05544622 748.223251 3.352629e-163
12 cg08141049 388.753626 3.830210e-85
8 cg17482649 234.045003 1.505821e-51
9 cg26039926 230.466573 9.011967e-51
7 cg00455876 204.595818 3.737512e-45
5 cg05443523 169.865790 1.300506e-37
11 cg01515508 153.367553 4.973465e-34
10 cg00423014 148.633679 5.304039e-33
2 cg07066594 122.553661 2.442366e-27
6 cg05257372 87.312124 1.097522e-19
4 cg00581848 76.637329 2.282525e-17
1 cg01938887 8.991284 1.115751e-02
13 age 4.452384 1.079387e-01
In [137]:
#输出正态检验结果
NormalResult.to_excel('/mnt/workspace/Analysis Data/Stats Results/NormalResult.xlsx','UTF-8')
NormalResult.to_csv('/mnt/workspace/Analysis Data/Stats Results/NormalResult.csv')
In [138]:
#查看数据分布
import seaborn as sns
for cg in CGList:
    kde=plt.figure()
    sns.distplot(StatsData.loc[:,cg])
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [139]:
#以是否发生类风湿性关节炎分组进行差异分析
#使用Mann-Whitney U和Kruskal-Wallis H进行差异分析
from scipy.stats import mannwhitneyu,kruskal
Diff_name,Mann_W_stats,Mann_W_pvalue=[],[],[]
KW_Stats,KW_Pvalue=[],[]
RA=StatsData.loc[StatsData.loc[:,'disease']=='rheumatoid arthritis',:]
Control=StatsData.loc[StatsData.loc[:,'disease']=='control',:]
for name in CGList:
    mann_stats,mann_pvalue=mannwhitneyu(RA.loc[:,name],Control.loc[:,name],alternative='two-sided')
    Diff_name.append(name)
    Mann_W_stats.append(mann_stats)
    Mann_W_pvalue.append(mann_pvalue)
    kw_stats,kw_pvalue=kruskal(RA.loc[:,name],Control.loc[:,name])
    KW_Stats.append(kw_stats)
    KW_Pvalue.append(kw_pvalue)
DiffResult=DataFrame()
DiffResult['CG']=Diff_name
DiffResult['Mann-Whitney U Stats']=Mann_W_stats
DiffResult['Mann-Whitney U Pvalue']=Mann_W_pvalue
DiffResult['Kruskal-Wallis H Stat']=KW_Stats
DiffResult['Kruskal-Wallis H Pvalue']=KW_Pvalue
DiffResult=DiffResult.sort_values('Mann-Whitney U Pvalue',ascending=True)
DiffResult
Out[139]:
CG Mann-Whitney U Stats Mann-Whitney U Pvalue Kruskal-Wallis H Stat Kruskal-Wallis H Pvalue
12 cg08141049 18348.5 1.423253e-18 77.375583 1.413519e-18
9 cg26039926 5909.5 2.189873e-16 67.433837 2.178774e-16
6 cg05257372 6207.5 3.556541e-15 61.940912 3.539194e-15
7 cg00455876 8289.5 2.437285e-14 58.156458 2.420782e-14
4 cg00581848 16820.0 6.371911e-13 51.741242 6.331863e-13
8 cg17482649 9202.0 1.038787e-09 37.261518 1.033031e-09
10 cg00423014 7932.5 2.694328e-09 35.400669 2.683947e-09
11 cg01515508 17054.5 7.118968e-08 29.038656 7.094835e-08
5 cg05443523 15647.0 4.901541e-07 25.310599 4.880197e-07
2 cg07066594 16273.5 3.926378e-06 21.306322 3.914385e-06
0 cg05544622 9963.0 1.647483e-04 14.200984 1.642846e-04
1 cg01938887 11455.0 1.482944e-01 2.091472 1.481233e-01
3 cg22561883 11780.5 2.944031e-01 1.100655 2.941223e-01
13 age 13486.5 3.022940e-01 1.065306 3.020081e-01
In [140]:
#输出差异分析结果
DiffResult.to_excel('/mnt/workspace/Analysis Data/Stats Results/DiffResult.xlsx','UTF-8')
DiffResult.to_csv('/mnt/workspace/Analysis Data/Stats Results/DiffResult.csv')
In [135]:
StatsData
Out[135]:
cg05544622 cg01938887 cg07066594 cg22561883 cg00581848 cg05443523 cg05257372 cg00455876 cg17482649 cg26039926 cg00423014 cg01515508 cg08141049 age gender disease GenderEncoder DiseaseEncoder
0 0.000000 -1.271175 -1.348848 -0.176421 -2.616463 -2.441207 3.938986 0.000000 0.000000 -2.749976 1.202370 3.701353 -0.866223 19.0000 F control 1.0 0
1 0.000000 -1.130596 -1.495729 0.347383 -2.468664 -1.536363 3.938986 0.000000 0.900040 -3.052934 1.019899 3.543689 -1.265340 12.0000 F control 1.0 0
2 0.000000 -1.168907 -0.553611 -0.196592 0.000000 -1.071863 3.580953 0.000000 0.856647 -3.228904 0.241113 1.557086 -0.575243 71.8192 F control 1.0 0
3 4.178048 -2.921730 -3.201956 -3.701353 -2.732410 0.000000 3.472874 0.510719 0.000000 -2.664729 1.839820 4.178048 0.000000 23.0000 M control 0.0 0
4 0.000000 -1.456115 -2.131436 0.011998 0.000000 0.000000 3.938986 0.000000 0.372163 -2.131436 0.837604 3.938986 -1.635580 84.0000 F control 1.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
313 0.000000 -1.798190 -1.236447 -0.019997 0.000000 0.000000 3.580953 0.000000 0.000000 -2.963760 0.000000 4.402578 0.000000 34.0000 F rheumatoid arthritis 1.0 1
314 3.659672 -4.112908 0.000000 -4.247583 0.000000 0.000000 0.000000 0.000000 0.000000 -2.253116 0.000000 3.993781 0.000000 51.0000 M rheumatoid arthritis 0.0 1
315 0.000000 -2.163468 0.000000 -0.180452 0.000000 0.000000 3.343970 0.000000 0.000000 -3.343970 1.130596 3.472874 0.000000 45.0000 F rheumatoid arthritis 1.0 1
316 0.000000 -1.578093 0.000000 -0.088039 0.000000 0.000000 3.256540 0.000000 0.000000 -2.767818 0.000000 4.178048 0.000000 31.0000 F rheumatoid arthritis 1.0 1
317 0.000000 -1.141464 -0.895180 -0.224898 0.000000 0.000000 3.149987 0.000000 0.000000 -3.076403 1.672671 4.940737 0.000000 58.0000 F rheumatoid arthritis 1.0 1

318 rows × 18 columns

In [152]:
#绘制特异性数据箱型图
for names in CGList:
    plt.figure(dpi=300)
    sns.boxplot(StatsData,x='disease',y=names,showmeans=True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [141]:
#以是类风湿性关节炎性别分组进行差异分析
#使用Mann-Whitney U和Kruskal-Wallis H进行差异分析
from scipy.stats import mannwhitneyu,kruskal
SexDiff_name,SexMann_W_stats,SexMann_W_pvalue=[],[],[]
SexKW_Stats,SexKW_Pvalue=[],[]
Man=RA.loc[RA.loc[:,'gender']=='M',:]
Feman=RA.loc[RA.loc[:,'gender']=='F',:]
for name in CGList:
    mann_stats,mann_pvalue=mannwhitneyu(Man.loc[:,name],Feman.loc[:,name],alternative='two-sided')
    SexDiff_name.append(name)
    SexMann_W_stats.append(mann_stats)
    SexMann_W_pvalue.append(mann_pvalue)
    kw_stats,kw_pvalue=kruskal(Man.loc[:,name],Feman.loc[:,name])
    SexKW_Stats.append(kw_stats)
    SexKW_Pvalue.append(kw_pvalue)
SexDiffResult=DataFrame()
SexDiffResult['CG']=SexDiff_name
SexDiffResult['Mann-Whitney U Stats']=SexMann_W_stats
SexDiffResult['Mann-Whitney U Pvalue']=SexMann_W_pvalue
SexDiffResult['Kruskal-Wallis H Stat']=SexKW_Stats
SexDiffResult['Kruskal-Wallis H Pvalue']=SexKW_Pvalue
SexDiffResult=DiffResult.sort_values('Mann-Whitney U Pvalue',ascending=True)
SexDiffResult
Out[141]:
CG Mann-Whitney U Stats Mann-Whitney U Pvalue Kruskal-Wallis H Stat Kruskal-Wallis H Pvalue
12 cg08141049 18348.5 1.423253e-18 77.375583 1.413519e-18
9 cg26039926 5909.5 2.189873e-16 67.433837 2.178774e-16
6 cg05257372 6207.5 3.556541e-15 61.940912 3.539194e-15
7 cg00455876 8289.5 2.437285e-14 58.156458 2.420782e-14
4 cg00581848 16820.0 6.371911e-13 51.741242 6.331863e-13
8 cg17482649 9202.0 1.038787e-09 37.261518 1.033031e-09
10 cg00423014 7932.5 2.694328e-09 35.400669 2.683947e-09
11 cg01515508 17054.5 7.118968e-08 29.038656 7.094835e-08
5 cg05443523 15647.0 4.901541e-07 25.310599 4.880197e-07
2 cg07066594 16273.5 3.926378e-06 21.306322 3.914385e-06
0 cg05544622 9963.0 1.647483e-04 14.200984 1.642846e-04
1 cg01938887 11455.0 1.482944e-01 2.091472 1.481233e-01
3 cg22561883 11780.5 2.944031e-01 1.100655 2.941223e-01
13 age 13486.5 3.022940e-01 1.065306 3.020081e-01
In [142]:
#输出差异分析结果
SexDiffResult.to_excel('/mnt/workspace/Analysis Data/Stats Results/SexDiffResult.xlsx','UTF-8')
SexDiffResult.to_csv('/mnt/workspace/Analysis Data/Stats Results/SexDiffResult.csv')
In [153]:
#绘制特异性数据箱型图
for names in CGList:
    plt.figure(dpi=300)
    sns.boxplot(StatsData,x='gender',y=names,showmeans=True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [143]:
#绘制箱型图
for cg_name in CGList:
    box=plt.figure(dpi=300)
    sns.boxplot(StatsData,x='disease',y=cg_name,hue='gender')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

计算相关性:年龄及其性别数据(cg07066594)的相关性。

In [156]:
#计算相关性
Spearman=StatsData.loc[:,CGList].corr(method='spearman')
plt.figure(dpi=300)
sns.clustermap(Spearman,annot=True)
plt.show()
<Figure size 1920x1440 with 0 Axes>
No description has been provided for this image
In [163]:
#绘制散点图查看数据分布
Scatter=plt.figure(dpi=300)
StatterAx=Scatter.add_subplot()
StatterAx.set_title('Scatterplot of methylation data')
scatter_plot=StatterAx.scatter(x=StatsData.loc[:,'cg00581848'],y=StatsData.loc[:,'cg05544622'],c=StatsData.loc[:,'cg07066594'],cmap='jet')
StatterAx.set_xlabel('cg00581848')
StatterAx.set_ylabel('cg05544622')
Scatter.colorbar(scatter_plot)
plt.show()
No description has been provided for this image
In [164]:
#绘制散点图查看数据分布
Scatter=plt.figure(dpi=300)
StatterAx=Scatter.add_subplot()
StatterAx.set_title('Scatterplot of methylation data')
scatter_plot=StatterAx.scatter(x=StatsData.loc[:,'cg00581848'],y=StatsData.loc[:,'cg05544622'],c=StatsData.loc[:,'age'],cmap='jet')
StatterAx.set_xlabel('cg00581848')
StatterAx.set_ylabel('cg05544622')
Scatter.colorbar(scatter_plot)
plt.show()
No description has been provided for this image
In [165]:
#输出统计学结果
StatsData.to_excel('/mnt/workspace/DNA methylation data/RA DNA methylation/StatsRawData.xlsx','UTF-8')
StatsData.to_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/StatsRawData.csv')
In [ ]: